diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f5385dc3e..e976f049f 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -4,24 +4,24 @@ "name": "NVIDIA Nemotron Team" }, "metadata": { - "description": "NVIDIA Nemotron AI stack plugins — pipeline builder, model knowledge bases, and contributor tools" + "description": "NVIDIA Nemotron AI stack plugins" }, "plugins": [ { - "name": "nemotron", - "source": "./plugins/nemotron", - "description": "NVIDIA Nemotron AI stack — pipeline builder and model knowledge bases", - "version": "0.3.0", + "name": "nemotron-customize", + "source": "./skills/nemotron-customize", + "description": "Compose runnable Nemotron model-customization pipelines from repo steps.", + "version": "0.1.0", "category": "ml-pipelines", - "keywords": ["nvidia", "nemotron", "training", "sft", "rl", "megatron", "models"] - }, - { - "name": "nemotron-dev", - "source": "./plugins/nemotron-dev", - "description": "Internal: contributor tools for Nemotron repo developers", - "version": "0.3.0", - "category": "developer-tools", - "keywords": ["nvidia", "nemotron", "internal", "contributing", "dev"] + "keywords": [ + "nvidia", + "nemotron", + "training", + "sft", + "rl", + "megatron", + "customization" + ] } ] } diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json deleted file mode 100644 index 46b16e537..000000000 --- a/.claude-plugin/plugin.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "name": "nemotron-customize", - "description": "Compose custom ML training pipelines from the NVIDIA AI stack", - "version": "0.1.0", - "author": { - "name": "NVIDIA Nemotron Team" - }, - "skills": [ - "./skills/" - ] -} diff --git a/.gitignore b/.gitignore index 130bd92b1..251fea2f1 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ CLAUDE.md # Compiled config config.yaml main.py +src/nemotron/steps/_bootstrap/runtime/ # Documentation build docs/_build/ diff --git a/README.md b/README.md 
index 43da7e9fe..027c22817 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,36 @@ --- +## Use from Claude Code + +This repo ships a Claude Code plugin called **`nemotron-customize`** that turns the step catalog under [`src/nemotron/steps/`](./src/nemotron/steps/) into a guided, repo-native pipeline builder. + +Install once: + +```text +/plugin marketplace add NVIDIA/Nemotron +/plugin install nemotron-customize@nvidia-nemotron +``` + +Then, **start Claude Code from the repo root** and invoke the skill: + +```bash +cd /path/to/Nemotron # repo root: must contain pyproject.toml and src/nemotron/steps/ +claude +``` + +```text +/nemotron-customize +``` + +The skill resolves all file paths against your current working directory, so it must be invoked from the Nemotron checkout root. Running it from a subdirectory will cause file reads to fail. + +The skill plans the step DAG, validates artifact wiring, and emits the YAML configs needed to run the requested pipeline. See [`skills/nemotron-customize/SKILL.md`](./skills/nemotron-customize/SKILL.md) for the full contract. + +> The marketplace installs **only** `nemotron-customize`. The other folders under [`skills/`](./skills/) (model knowledge bases, contributor add-`*` skills) stay on disk for repo browsing but are not loaded as plugins. + +--- + ## Repository Overview ``` diff --git a/deploy/nemotron-customizer/airgap/.gitignore b/deploy/nemotron-customizer/airgap/.gitignore new file mode 100644 index 000000000..6ccaadce0 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/.gitignore @@ -0,0 +1,7 @@ +# Generated by airgap runner. +out/ +airgap-bundle/ +archives/ +__pycache__/ +*.lock.yaml +*.tar diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution b/deploy/nemotron-customizer/airgap/Dockerfile.execution new file mode 100644 index 000000000..acc9fb7bd --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution @@ -0,0 +1,52 @@ +# Derivative execution image for Nemotron Customizer airgap. 
+# Built from the real training/runtime image and only adds small missing +# wrapper packages. + +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +ARG EXECUTION_REQUIREMENTS +ARG REPO_OVERLAYS +ARG REPO_OVERLAYS_DIR +ARG PYTHON_BIN=python +ARG PIP_NO_DEPS=true + +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 +ENV HF_DATASETS_OFFLINE=1 +ENV WANDB_MODE=offline + +COPY ${EXECUTION_REQUIREMENTS} /opt/nemotron-airgap/execution-requirements.txt +COPY ${REPO_OVERLAYS} /opt/nemotron-airgap/repo-overlays.json +COPY ${REPO_OVERLAYS_DIR}/ /opt/nemotron-airgap/repo-overlays/ + +# Build-time installs keep --no-cache-dir so derivative image layers stay small. +RUN if [ -s /opt/nemotron-airgap/execution-requirements.txt ]; then \ + if [ "${PIP_NO_DEPS}" = "true" ]; then \ + ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/execution-requirements.txt; \ + else \ + ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/execution-requirements.txt; \ + fi; \ + fi && \ + ${PYTHON_BIN} - <<'PY' +import json +import pathlib +import shutil + +root = pathlib.Path("/opt/nemotron-airgap/repo-overlays") +items = json.loads(pathlib.Path("/opt/nemotron-airgap/repo-overlays.json").read_text()) +for item in items: + repo = item["repo"] + source = item.get("source", repo) + target = pathlib.Path(item["target"]) + src = root / source + if not src.exists(): + raise SystemExit(f"missing baked repo overlay: {src}") + if target.exists() or target.is_symlink(): + if target.is_dir() and not target.is_symlink(): + shutil.rmtree(target) + else: + target.unlink() + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(src, target) +PY diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore new file mode 100644 index 000000000..9ec7d6457 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore @@ -0,0 +1,14 @@ +** + +!deploy 
+!deploy/nemotron-customizer +!deploy/nemotron-customizer/airgap +!deploy/nemotron-customizer/airgap/out +!deploy/nemotron-customizer/airgap/out/execution-context +!deploy/nemotron-customizer/airgap/out/execution-context/** +!deploy/nemotron-customizer/airgap/out/repo-overlays +!deploy/nemotron-customizer/airgap/out/repo-overlays/** + +**/.git +**/__pycache__ +**/*.pyc diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher b/deploy/nemotron-customizer/airgap/Dockerfile.launcher new file mode 100644 index 000000000..7d26315d5 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.launcher @@ -0,0 +1,30 @@ +# Launcher image for Nemotron Customizer airgap. +# It contains the repo and a uv-synced environment. It does not run training. + +ARG BASE_IMAGE=python:3.12-slim +FROM ${BASE_IMAGE} + +ARG UV_VERSION=0.11.1 + +WORKDIR /workspace/Nemotron + +ENV UV_LINK_MODE=copy +ENV UV_PYTHON_DOWNLOADS=never +ENV HF_HUB_OFFLINE=1 +ENV TRANSFORMERS_OFFLINE=1 +ENV HF_DATASETS_OFFLINE=1 +ENV WANDB_MODE=offline +ENV PYTHONPATH=/workspace/Nemotron/src +ENV PATH=/workspace/Nemotron/.venv/bin:$PATH + +RUN apt-get update && \ + apt-get install -y --no-install-recommends git ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --no-cache-dir "uv==${UV_VERSION}" + +COPY . . 
+ +RUN uv sync --frozen --no-dev + +CMD ["bash"] diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore new file mode 100644 index 000000000..6cecc5520 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore @@ -0,0 +1,21 @@ +.git +.venv +.ruff_cache +.pytest_cache +**/__pycache__ +**/*.pyc + +/.nemo_run +/outputs +/output +/logs +/checkpoints +/wandb +/data +/downloads + +deploy/nemotron-customizer/airgap/out +deploy/nemotron-customizer/airgap/airgap-bundle +deploy/nemotron-customizer/airgap/archives +deploy/nemotron-customizer/airgap/*.tar +deploy/nemotron-customizer/airgap/*.lock.yaml diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md new file mode 100644 index 000000000..718135790 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/README.md @@ -0,0 +1,135 @@ +# Nemotron Customizer Airgap + +This folder is scoped only to Nemotron Customizer steps under +`src/nemotron/steps/`. + +The flow is intentionally small: + +1. Build one **launcher image** with this repo and `uv.lock`. +2. Build one or more **execution images** by grouping selected workflow stages by base image. +3. Save those images as tarballs for the airgapped side. +4. Keep models, datasets, checkpoints, and customer files on persistent storage. + +Edit `airgap.yaml` first: + +- `workflow.stages`: the Nemotron Customizer steps the customer wants to run +- `dependencies`: central step dependency map, for example SFT training needs SFT packing +- `step_execution_images`: which execution image each step should use +- `execution_images`: the base image, output tag, and known/import-probed Python requirements + +Only steps reached from `workflow.stages` are built. Steps are grouped by +`base_image + repo_overlays`; each group gets one derivative image with the +union of its small missing packages. 
If two selected step families share the +same base image and repo overlays, the runner emits one combined execution image for +both. + +Run from the repo root: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml +``` + +That prints the plan. To actually pull/build/save images on the connected +machine: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --execute +``` + +To run only a few stages: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --stage validate \ + --stage discover-execution-deps +``` + +To override the workflow without editing YAML, pass one or more selected +Nemotron step targets. Dependencies are still expanded from `dependencies`. +For example, SDG plus SFT also adds `data_prep/sft_packing` because SFT needs packed +data: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --target sdg/data_designer:tiny \ + --target sft/megatron_bridge:tiny +``` + +Outputs are written under `deploy/nemotron-customizer/airgap/out/` by default: + +- `airgap-manifest.yaml`: what was validated and built +- `airgap-build-state.yaml`: incomplete execute run state used for resume +- `airgap-build-complete.yaml`: final execute run state after success +- `requirements-*.txt`: small missing packages per execution image +- `repo-overlays-*.json`: git auto-mounts discovered from selected step configs +- `launcher-image.tar` +- `execution-*.tar` +- SHA256 checksums for saved image tarballs in `airgap-manifest.yaml` + +If an execute run fails midway, leave `airgap-build-state.yaml` in place and rerun +the same command. Completed expensive actions are reused when their artifacts +still exist.
If you intentionally change the workflow or image plan before +finishing, move or remove `airgap-build-state.yaml` first; the runner will not +silently overwrite incomplete state from a different plan. + +Runtime dependency probes use Docker volumes named +`nemotron-airgap-pip-cache-*` to avoid downloading the same wheels on +every probe loop. To reset them, run `docker volume ls | grep +nemotron-airgap-pip-cache` and remove the relevant volume with +`docker volume rm`. + +Large assets are not baked into images. The customer should stage them on +executor-visible persistent storage and reference them through config overrides +and `run.env.mounts`. + +During dependency discovery, the runner mounts the connected-machine checkout +into each execution image only to probe imports. The final execution image deliberately +does not bake this repo; the launcher image and the normal nemo-run/nemo-runspec +code transport provide the repo to the remote job at submission time. + +Repo logistics stay outside `airgap.yaml`. If a selected step config contains +`${auto_mount:git+...}`, the runner treats it as a connected-machine build input: +it fetches that pinned repo and bakes it into the derivative execution image at the +requested target path. Runtime jobs then use the baked image and do not clone +from GitHub. Site-specific data/model mounts remain in env profiles or step +overrides. + +If the connected machine is not the same architecture as the target cluster, +set `platform: linux/amd64` on the `launcher_image` or execution image entry in +`airgap.yaml`. If you need to minimize transfer size for several images that +share layers, `docker save -o all-images.tar tag1 tag2 ...` can be used after +the runner builds the images; a single tar deduplicates shared layers better +than one tar per image. + +The Dockerfiles expect the chosen base images to have Python and `pip` available +for bootstrapping small offline additions.
The runtime defaults bake +`HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, and +`WANDB_MODE=offline`; customers with an internal mirror can override those at +submission time through their env profile or `run.env.env_vars`. + +For SFT Megatron-Bridge, build with the normal config so the runner can discover +the pinned Megatron-LM and Megatron-Bridge auto-mounts: + +```yaml +workflow: + stages: + - sft/megatron_bridge:tiny +``` + +When submitting inside the airgap, use the deploy overlay config so those git +auto-mounts are cleared at runtime while persistent storage mounts from the env +profile still apply. Use the image printed by the runner under +`selected execution images`, or read it from `out/airgap-manifest.yaml` under +`step_execution_images`. + +```bash +uv run nemotron steps run sft/megatron_bridge \ + -c deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml \ + -b ENV_PROFILE \ + run.env.container_image=EXECUTION_IMAGE_TAG +``` diff --git a/deploy/nemotron-customizer/airgap/SKILL.md b/deploy/nemotron-customizer/airgap/SKILL.md new file mode 100644 index 000000000..20a0d0798 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/SKILL.md @@ -0,0 +1,115 @@ +--- +name: nemotron-customizer-airgap +description: Prepare, validate, build, and use Nemotron Customizer airgap image bundles for offline clusters. Use when planning airgapped deployments, editing deploy/nemotron-customizer/airgap/airgap.yaml, selecting workflow targets, grouping step execution images, baking repo overlays or wheel additions, resuming airgap runner builds, or submitting `nemotron steps run` jobs inside an airgapped environment. +--- + +# Nemotron Customizer Airgap + +Use this skill to help an agent produce a connected-machine airgap bundle and +then submit Nemotron Customizer steps from the airgapped side. Keep it grounded +in the checked-in runner and manifests; do not invent a parallel packaging flow.
+ +## Read First + +- `deploy/nemotron-customizer/airgap/README.md` for the operator flow. +- `deploy/nemotron-customizer/airgap/airgap.yaml` for the current image map. +- `deploy/nemotron-customizer/airgap/runner.py` when changing behavior. +- `tests/deploy/test_airgap_runner.py` before editing runner logic. +- `deploy/nemotron-customizer/airgap/configs/` for runtime overlay configs. + +For selected steps, inspect the catalog through the CLI: + +```bash +uv run nemotron steps show STEP_ID --json +``` + +## Workflow + +1. Establish the side of the workflow: + - Connected machine: validate, build, save image tarballs. + - Airgapped side: load images, set env profiles, run selected steps. + +2. Gather the minimum inputs: + - Target steps and config names, for example `sft/megatron_bridge:tiny`. + - Target architecture or Docker platform, for example `linux/amd64`. + - Available base images and whether the connected machine can pull them. + - Airgapped env profile name, mounts, model/data/checkpoint locations. + - Whether destructive or expensive actions such as `--execute`, Docker build, + Docker volume cleanup, or state-file removal are explicitly allowed. + +3. Plan with the runner first: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml +``` + +Use `--target step-id[:config]` for one-off selections without editing YAML. +The runner expands dependencies from `dependencies`, validates selected step +files/configs, groups execution images, and prints selected execution images. + +4. Edit `airgap.yaml` only where the runner expects configuration: + - `workflow.stages` or CLI `--target` for selected customer steps. + - `dependencies` for explicit upstream Nemotron Customizer step outputs. + - `step_execution_images` for step-to-image mapping. + - `execution_images` for base image, tag, tar, platform, and import probes. + - `launcher_image` for the launcher container. + +5.
Execute only when the user asks for a real build: + +```bash +uv run python deploy/nemotron-customizer/airgap/runner.py \ + --config deploy/nemotron-customizer/airgap/airgap.yaml \ + --execute +``` + +If a build fails midway, keep `airgap-build-state.yaml` and rerun the same +command. Remove or move that state only when intentionally changing the plan. + +6. On the airgapped side, use images from `out/airgap-manifest.yaml` under +`step_execution_images`. Submit with the plural CLI: + +```bash +uv run nemotron steps run STEP_ID \ + -c CONFIG_YAML \ + -b ENV_PROFILE \ + run.env.container_image=EXECUTION_IMAGE_TAG +``` + +For `sft/megatron_bridge`, prefer the airgap overlay configs under +`deploy/nemotron-customizer/airgap/configs/`; they clear runtime git auto-mounts +because the runner bakes those repos into the execution image. + +## Guardrails + +- Keep models, datasets, checkpoints, secrets, and customer files out of images. + Put them on persistent storage and reference them through config overrides and + `run.env.mounts`. +- Treat `${auto_mount:git+...}` as a connected-machine build input. The runner + bakes pinned repo overlays into execution images so airgapped jobs do not clone + from GitHub. +- Do not add missing packages blindly. Let `discover-execution-deps` and + import probes determine small additions; keep heavyweight framework deps in + the base image choice. +- Preserve offline defaults unless the user has an internal mirror: + `HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, + and `WANDB_MODE=offline`. +- Use `nemotron steps ...`; do not reintroduce `nemotron step ...`.
+ +## Validation + +After edits to runner logic, YAML structure, or airgap docs, run: + +```bash +uv run pytest tests/deploy/test_airgap_runner.py -q +``` + +For CLI-facing examples, also smoke the command shape: + +```bash +uv run nemotron steps --help +uv run nemotron steps show data_prep/sft_packing --json +``` + +Do not run Docker build/save stages during validation unless the user explicitly +asked for a real connected-machine bundle build. diff --git a/deploy/nemotron-customizer/airgap/airgap.yaml b/deploy/nemotron-customizer/airgap/airgap.yaml new file mode 100644 index 000000000..61ae65c46 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/airgap.yaml @@ -0,0 +1,129 @@ +# One file controls the Nemotron Customizer airgap plan. +# +# Change workflow.stages to the steps the customer wants. The runner expands +# dependencies, validates those step files/configs, groups selected steps by +# execution image, then builds only the images needed for that selection. + +workflow: + name: sft-megatron-bridge + stages: + - sft/megatron_bridge:tiny + # Example SDG-only run: + # stages: + # - sdg/data_designer:tiny + # Example SDG -> SFT run: + # stages: + # - sdg/data_designer:tiny + # - sft/megatron_bridge:tiny + +build_stages: + - validate + - discover-execution-deps + - build-launcher-image + - build-execution-images + - save-images + +paths: + output_dir: deploy/nemotron-customizer/airgap/out + +launcher_image: + base_image: python:3.12-slim + tag: nemotron-customizer-launcher-airgap:latest + tar: launcher-image.tar + +# Central dependency map. Keep this small and explicit: it is only for steps +# that naturally require a previous Nemotron Customizer step output. 
+dependencies: + sft/megatron_bridge: + - data_prep/sft_packing:tiny + peft/megatron_bridge: + - data_prep/sft_packing:tiny + pretrain/megatron_bridge: + - data_prep/pretrain_prep:tiny + pretrain/automodel: + - data_prep/pretrain_prep:tiny + rl/nemo_rl/dpo: + - data_prep/rl_prep:tiny + rl/nemo_rl/rlhf: + - data_prep/rl_prep:tiny + rl/nemo_rl/rlvr: + - data_prep/rl_prep:tiny + # SDG can feed SFT or RL prep, but it is not forced as a dependency because + # many customers bring their own JSONL on persistent storage. + +# Step -> execution-image mapping. The runner only uses entries reached from +# workflow.stages after dependency expansion. +step_execution_images: + byob/mcq: nemo-data-designer + convert/hf_to_megatron: nemo-megatron + convert/megatron_to_hf: nemo-megatron + convert/merge_lora: nemo-megatron + curate/nemo_curator: nemo-curator + env/env_toml: launcher-python + eval/model_eval: nemo-eval + optimize/modelopt/distill: nemo-modelopt + optimize/modelopt/prune: nemo-modelopt + optimize/modelopt/quantize: nemo-modelopt + peft/automodel: nemo-automodel + peft/megatron_bridge: nemo-megatron + data_prep/pretrain_prep: nemo-megatron + data_prep/rl_prep: nemo-rl + data_prep/sft_packing: nemo-megatron + pretrain/automodel: nemo-automodel + pretrain/megatron_bridge: nemo-megatron + rl/nemo_rl/dpo: nemo-rl + rl/nemo_rl/rlhf: nemo-rl + rl/nemo_rl/rlvr: nemo-rl + sdg/data_designer: nemo-data-designer + sft/automodel: nemo-automodel + sft/megatron_bridge: nemo-megatron + translate/nemo_curator: nemo-curator + +execution_images: + launcher-python: + base_image: python:3.12-slim + tag: nemotron-customizer-python-execution-airgap:latest + tar: execution-python-image.tar + + nemo-megatron: + base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-megatron-airgap:latest + tar: execution-nemo-megatron-image.tar + required_imports: [] + + nemo-automodel: + base_image: nvcr.io/nvidia/nemo-automodel:26.04 + tag: 
nemotron-customizer-nemo-automodel-airgap:latest + tar: execution-nemo-automodel-image.tar + required_imports: [] + + nemo-rl: + base_image: nvcr.io/nvidia/nemo-rl:v0.6.0 + tag: nemotron-customizer-nemo-rl-airgap:latest + tar: execution-nemo-rl-image.tar + required_imports: [] + + nemo-modelopt: + base_image: nvcr.io/nvidia/nemo:26.02 + tag: nemotron-customizer-nemo-modelopt-airgap:latest + tar: execution-nemo-modelopt-image.tar + required_imports: [] + + nemo-curator: + base_image: nvcr.io/nvidia/nemo-curator:25.07 + tag: nemotron-customizer-nemo-curator-airgap:latest + tar: execution-nemo-curator-image.tar + required_imports: [] + + nemo-data-designer: + base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-data-designer-airgap:latest + tar: execution-nemo-data-designer-image.tar + required_imports: + - data_designer + + nemo-eval: + base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano + tag: nemotron-customizer-nemo-eval-airgap:latest + tar: execution-nemo-eval-image.tar + required_imports: [] diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml new file mode 100644 index 000000000..a2e4b828c --- /dev/null +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml @@ -0,0 +1,12 @@ +# Airgap runtime overlay for sft/megatron_bridge:default. +# +# The connected-machine airgap runner bakes the auto_mount repos from the base +# config into the derivative execution image. At runtime, clear those git auto-mounts +# so the airgapped job does not clone from GitHub. Env-profile persistent +# storage mounts still append normally. 
+ +defaults: ../../../../src/nemotron/steps/sft/megatron_bridge/config/default.yaml + +run: + env: + mounts: [] diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml new file mode 100644 index 000000000..eb71f5f96 --- /dev/null +++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml @@ -0,0 +1,12 @@ +# Airgap runtime overlay for sft/megatron_bridge:tiny. +# +# The connected-machine airgap runner bakes the auto_mount repos from the base +# config into the derivative execution image. At runtime, clear those git auto-mounts +# so the airgapped job does not clone from GitHub. Env-profile persistent +# storage mounts still append normally. + +defaults: ../../../../src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml + +run: + env: + mounts: [] diff --git a/deploy/nemotron-customizer/airgap/runner.py b/deploy/nemotron-customizer/airgap/runner.py new file mode 100644 index 000000000..c6cf33d4e --- /dev/null +++ b/deploy/nemotron-customizer/airgap/runner.py @@ -0,0 +1,1244 @@ +#!/usr/bin/env python3 +"""Lightweight airgap image runner for Nemotron Customizer. + +This file intentionally lives under deploy/nemotron-customizer/airgap instead +of adding a new step. It is a connected-machine helper that validates requested +steps, discovers small execution-image Python gaps, builds launcher/execution images, and +saves image tarballs. 
+""" + +from __future__ import annotations + +import argparse +import ast +import hashlib +import importlib.metadata as metadata +import json +import re +import shutil +import subprocess +import sys +from collections.abc import Iterable, Mapping +from dataclasses import dataclass, field +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + +import tomllib +import yaml + +AIRGAP_DIR = Path(__file__).resolve().parent +REPO_ROOT = AIRGAP_DIR.parents[2] +SRC_ROOT = REPO_ROOT / "src" +STEP_ROOT = SRC_ROOT / "nemotron" / "steps" +DEFAULT_OUTPUT_DIR = AIRGAP_DIR / "out" +UV_VERSION = "0.11.1" +PROGRESS_STATE = "airgap-build-state.yaml" +COMPLETE_STATE = "airgap-build-complete.yaml" +LOCAL_PREFIXES = ("nemotron", "nemo_runspec") +CORE_IMPORTS = { + "datasets", + "megatron", + "nemo", + "numpy", + "ray", + "torch", + "transformers", + "triton", + "vllm", +} +IMPORT_ALIASES = { + "yaml": "pyyaml", + "pydantic_settings": "pydantic-settings", + "huggingface_hub": "huggingface-hub", + "cosmos_xenna": "cosmos-xenna", + "data_designer": "data-designer", + "nemo_curator": "nemo-curator", +} + + +@dataclass(frozen=True) +class Target: + step_id: str + config: str | None = None + + @property + def spec(self) -> str: + return f"{self.step_id}:{self.config}" if self.config else self.step_id + + +@dataclass +class StepInfo: + target: Target + step_dir: Path + step_py: Path + step_toml: Path + config_path: Path | None + module: str + mounts: list[Any] = field(default_factory=list) + repo_overlays: list[RepoOverlay] = field(default_factory=list) + + +@dataclass(frozen=True) +class RepoOverlay: + repo: str + url: str + ref: str + target: str + + +@dataclass +class ExecutionGroup: + name: str + base_image: str + tag: str + tar: Path + steps: list[str] + platform: str | None = None + required_imports: set[str] = field(default_factory=set) + repo_overlays: list[RepoOverlay] = field(default_factory=list) + pip_no_deps: bool = True + candidate_imports: 
set[str] = field(default_factory=set) + missing_imports: list[str] = field(default_factory=list) + missing_core_imports: list[str] = field(default_factory=list) + requirements: list[str] = field(default_factory=list) + requirements_path: Path | None = None + repo_overlays_path: Path | None = None + selected_image: str | None = None + image_names: set[str] = field(default_factory=set) + + +@dataclass +class RunState: + path: Path + done_path: Path + data: dict[str, Any] + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Build Nemotron Customizer airgap images from one YAML file.") + parser.add_argument("--config", default=str(AIRGAP_DIR / "airgap.yaml"), help="Airgap runner YAML.") + parser.add_argument("--execute", action="store_true", help="Run docker/git commands. Default prints the plan.") + parser.add_argument("--stage", action="append", help="Stage to run. Repeatable. Defaults to config stages.") + parser.add_argument( + "--target", + action="append", + help="Nemotron step target step-id[:config]. Repeatable. 
Overrides workflow.stages.", + ) + args = parser.parse_args(argv) + + config_path = resolve_input_path(Path(args.config)) + cfg = load_yaml(config_path) + if args.target: + cfg = with_workflow_targets(cfg, normalize_target_specs(args.target)) + stages = normalize_stages(args.stage or cfg.get("build_stages") or cfg.get("stages") or []) + output_dir = resolve_repo_path(Path(cfg.get("paths", {}).get("output_dir", DEFAULT_OUTPUT_DIR))) + if "build-execution-images" in stages: + validate_docker_context_path(output_dir, field="paths.output_dir") + output_dir.mkdir(parents=True, exist_ok=True) + run_state = load_or_start_run_state( + output_dir, + config_path=config_path, + cfg=cfg, + stages=stages, + execute=args.execute, + ) + saved_images: list[dict[str, Any]] = [] + workflow = cfg.get("workflow") if isinstance(cfg.get("workflow"), Mapping) else {} + + print(f"[airgap] config={config_path}") + print(f"[airgap] mode={'execute' if args.execute else 'plan'}") + print(f"[airgap] stages={', '.join(stages)}") + + expanded_targets: list[Target] = [] + step_infos: dict[str, StepInfo] = {} + groups: list[ExecutionGroup] = [] + workflow_manifest: dict[str, Any] = { + "stages": list(workflow.get("stages") or []), + } + if workflow.get("name"): + workflow_manifest["name"] = workflow.get("name") + manifest: dict[str, Any] = { + "schema_version": 1, + "workflow": workflow_manifest, + "output_dir": str(output_dir), + "build_stages": stages, + } + + if "validate" in stages or any(stage_needs_targets(stage) for stage in stages): + begin_action(run_state, "validate") + expanded_targets = expand_targets(cfg) + step_infos = validate_targets(expanded_targets) + manifest["targets"] = [step_to_manifest(info) for info in step_infos.values()] + print(f"[validate] {len(step_infos)} target(s) ok") + complete_action(run_state, "validate", {"targets": [target.spec for target in expanded_targets]}) + + if any(stage in stages for stage in ("discover-execution-deps", "build-execution-images", 
"save-images")): + groups = execution_groups(cfg, output_dir=output_dir, step_infos=step_infos) + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] + + if "discover-execution-deps" in stages: + if action_completed(run_state, "discover-execution-deps") and hydrate_discovered_groups(run_state, groups): + print("[resume] skipping discover-execution-deps; using saved probe results") + else: + begin_action(run_state, "discover-execution-deps") + locked_versions = locked_package_versions(REPO_ROOT / "uv.lock") + for group in groups: + discover_execution_deps( + group, + step_infos=step_infos, + locked_versions=locked_versions, + execute=args.execute, + ) + remember_discovered_groups(run_state, groups) + complete_action(run_state, "discover-execution-deps", {"groups": [group.name for group in groups]}) + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] + + if "build-launcher-image" in stages: + launcher_image = cfg.get("launcher_image", {}) + launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest") + platform = launcher_image_platform(launcher_image) + action = "build-launcher-image" + if action_completed(run_state, action) and docker_image_exists(launcher_image_tag, platform=platform): + print(f"[resume] skipping {action}; image exists: {launcher_image_tag}") + else: + begin_action(run_state, action) + status = build_launcher_image(launcher_image, execute=args.execute) + if status: + return status + complete_action(run_state, action, {"image": launcher_image_tag}) + manifest["launcher_image"] = launcher_image_manifest(launcher_image) + + if "build-execution-images" in stages: + clean_stale_group_dirs(output_dir, groups, execute=args.execute) + for group in groups: + action = f"build-execution-image:{group.name}" + if action_completed(run_state, action) and docker_image_exists(group.tag, platform=group.platform): + print(f"[resume] skipping {action}; image 
exists: {group.tag}") + else: + begin_action(run_state, action) + status = build_execution_image(group, output_dir=output_dir, execute=args.execute) + if status: + return status + complete_action(run_state, action, {"image": group.tag}) + manifest["execution_groups"] = [execution_group_manifest(group) for group in groups] + + if "save-images" in stages: + launcher_image = cfg.get("launcher_image", {}) + if launcher_image: + output = output_dir / str(launcher_image.get("tar", "launcher-image.tar")) + launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest") + action = f"save-image:{launcher_image_tag}" + if action_completed(run_state, action) and output.exists(): + print(f"[resume] skipping {action}; tar exists: {output}") + else: + begin_action(run_state, action) + status = save_image(launcher_image_tag, output, args.execute) + if status: + return status + complete_action(run_state, action, {"tar": str(output)}) + saved_images.append( + saved_image_manifest( + launcher_image_tag, + output, + execute=args.execute, + role="launcher", + name="launcher", + ) + ) + for group in groups: + action = f"save-image:{group.tag}" + if action_completed(run_state, action) and group.tar.exists(): + print(f"[resume] skipping {action}; tar exists: {group.tar}") + else: + begin_action(run_state, action) + status = save_image(group.tag, group.tar, args.execute) + if status: + return status + complete_action(run_state, action, {"tar": str(group.tar)}) + saved_images.append( + saved_image_manifest(group.tag, group.tar, execute=args.execute, role="execution", name=group.name) + ) + + manifest["persistent_assets"] = { + "policy": "models, datasets, checkpoints, and customer data stay on executor-visible persistent storage", + "mounts_from_configs": collect_mounts(step_infos.values()), + "baked_repo_overlays": [repo_overlay_manifest(item) for item in collect_repo_overlays(step_infos.values())], + } + manifest["step_execution_images"] = 
def normalize_target_specs(values: Iterable[str]) -> list[str]:
    """Flatten raw target values into an ordered list of target specs.

    Every value may itself hold several comma-separated specs. Each piece is
    stripped of surrounding whitespace and empty pieces are discarded. Order
    is preserved and duplicates are kept.
    """
    pieces = (piece.strip() for value in values for piece in str(value).split(","))
    return [piece for piece in pieces if piece]
def load_or_start_run_state(
    output_dir: Path,
    *,
    config_path: Path,
    cfg: Mapping[str, Any],
    stages: list[str],
    execute: bool,
) -> RunState | None:
    """Resume the in-progress run recorded in ``output_dir`` or start a new one.

    Returns ``None`` when ``execute`` is false, so no state is tracked. If a
    progress file already exists it must be a YAML mapping whose ``signature``
    equals the signature of the current plan; otherwise ``SystemExit`` is
    raised. When no progress file exists, a fresh state is created, written
    to disk and returned.
    """
    if not execute:
        return None

    progress_path = output_dir / PROGRESS_STATE
    finished_path = output_dir / COMPLETE_STATE
    signature = run_signature(config_path=config_path, cfg=cfg, stages=stages)

    if progress_path.exists():
        saved = yaml.safe_load(progress_path.read_text(encoding="utf-8")) or {}
        if not isinstance(saved, dict):
            raise SystemExit(f"{progress_path} must contain YAML mapping state")
        if saved.get("signature") != signature:
            raise SystemExit(
                f"{progress_path} is an incomplete airgap run for a different plan. "
                "Finish it, move it aside, or remove it before starting a new plan."
            )
        print(f"[resume] found incomplete run state: {progress_path}")
        return RunState(path=progress_path, done_path=finished_path, data=saved)

    workflow = cfg.get("workflow")
    if not isinstance(workflow, Mapping):
        workflow = {}
    # Key order matters: the state is dumped with sort_keys=False.
    fresh: dict[str, Any] = {
        "schema_version": 1,
        "signature": signature,
        "config": str(config_path.resolve()),
        "workflow_stages": list(workflow.get("stages") or []),
        "build_stages": stages,
        "started_at": timestamp(),
        "completed_actions": {},
        "discovered_groups": {},
    }
    if finished_path.exists():
        fresh["previous_complete"] = str(finished_path)

    state = RunState(path=progress_path, done_path=finished_path, data=fresh)
    write_run_state(state)
    print(f"[airgap] progress state={progress_path}")
    return state
def write_run_state(state: RunState | None) -> None:
    """Persist the in-progress run state to ``state.path`` as YAML.

    ``None`` is accepted and ignored so callers need not check whether state
    tracking is active. ``updated_at`` is refreshed on every write.

    The document is first written to a sibling ``.tmp`` file and then renamed
    over the target. An interruption in the middle of a write therefore
    cannot leave a truncated state file that would break a later resume.
    """
    if state is None:
        return
    state.data["updated_at"] = timestamp()
    text = yaml.safe_dump(state.data, sort_keys=False)
    tmp_path = state.path.with_name(state.path.name + ".tmp")
    tmp_path.write_text(text, encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    tmp_path.replace(state.path)
def normalize_stages(stages: Iterable[str]) -> list[str]:
    """Return the build stages to run, de-duplicated and in pipeline order.

    Each input value may be a comma-separated list. When nothing is
    requested, the full pipeline is used. Prerequisite stages are added
    automatically, with a notice printed for each one. Names that are not
    part of the known pipeline are kept and sorted after the known stages.
    """
    pipeline = (
        "validate",
        "discover-execution-deps",
        "build-launcher-image",
        "build-execution-images",
        "save-images",
    )
    # (required, dependent): `required` must be present whenever `dependent` is.
    prerequisites = (
        ("build-execution-images", "save-images"),
        ("build-launcher-image", "save-images"),
        ("discover-execution-deps", "build-execution-images"),
        ("validate", "discover-execution-deps"),
        ("validate", "build-execution-images"),
        ("validate", "save-images"),
    )

    names = (piece.strip() for raw in stages for piece in str(raw).split(","))
    selected = list(dict.fromkeys(name for name in names if name))
    if not selected:
        selected = list(pipeline)

    # Apply prerequisite edges from later stages toward earlier stages. Each
    # insertion is idempotent, so a user can ask for any suffix of the pipeline.
    for required, dependent in prerequisites:
        if dependent in selected and required not in selected:
            selected.insert(selected.index(dependent), required)
            print(f"[airgap] auto-adding stage {required!r} because {dependent!r} was requested")

    rank = {stage: position for position, stage in enumerate(pipeline)}
    return sorted(selected, key=lambda stage: rank.get(stage, len(rank)))
def read_config_mounts(config_path: Path | None) -> list[Any]:
    """Return the ``run.env.mounts`` list declared in a step config.

    Args:
        config_path: Path to the step's YAML config, or ``None`` when the
            target has no config.

    Returns:
        The list found at ``run.env.mounts``. An empty list is returned when
        the path is ``None`` or missing, when the file cannot be read or
        parsed, or when any level of that structure is absent or has an
        unexpected type.
    """
    if config_path is None or not config_path.exists():
        return []
    try:
        data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # Stay best-effort, but say why the list is empty instead of silently
        # hiding an unreadable or malformed config.
        print(f"[airgap] warning: could not read mounts from {config_path}: {exc}", file=sys.stderr)
        return []
    if not isinstance(data, Mapping):
        return []
    run = data.get("run")
    if not isinstance(run, Mapping):
        return []
    env = run.get("env")
    if not isinstance(env, Mapping):
        return []
    mounts = env.get("mounts")
    return mounts if isinstance(mounts, list) else []
execution_groups_from_step_execution_images( + cfg: Mapping[str, Any], + *, + output_dir: Path, + step_infos: Mapping[str, StepInfo], +) -> list[ExecutionGroup]: + step_execution_images = normalize_step_execution_images(cfg.get("step_execution_images") or {}) + image_defs = normalize_execution_images(cfg.get("execution_images") or {}) + merged: dict[str, ExecutionGroup] = {} + + for step_id in step_infos: + image_name = step_execution_images.get(step_id) + if not image_name: + raise SystemExit(f"{step_id}: missing step_execution_images entry in airgap.yaml") + image_def = image_defs.get(image_name) + if image_def is None: + raise SystemExit(f"{step_id}: step_execution_images points to unknown execution image {image_name!r}") + base = str(image_def.get("base_image") or "").strip() + if not base: + raise SystemExit(f"execution_images.{image_name}.base_image is required") + repo_overlays = getattr(step_infos[step_id], "repo_overlays", []) + group_key = execution_group_key(base, repo_overlays) + group = merged.get(group_key) + if group is None: + suffix = short_hash( + { + "base_image": base, + "repo_overlays": [repo_overlay_manifest(item) for item in repo_overlays], + } + ) + group = ExecutionGroup( + name=f"{image_name}-{suffix}", + base_image=base, + tag="", + tar=output_dir / "execution-image.tar", + steps=[], + platform=str(image_def["platform"]) if image_def.get("platform") else None, + pip_no_deps=bool(image_def.get("pip_no_deps", True)), + repo_overlays=list(repo_overlays), + ) + merged[group_key] = group + group.image_names.add(image_name) + group.steps.append(step_id) + group.required_imports.update(str(name) for name in image_def.get("required_imports") or []) + group.repo_overlays = merge_repo_overlays( + group.repo_overlays, + repo_overlays, + ) + for group in merged.values(): + finalize_execution_group_name(group, image_defs=image_defs, output_dir=output_dir) + return list(merged.values()) + + +def finalize_execution_group_name( + group: ExecutionGroup, + 
def tag_with_suffix(tag: str, suffix: str) -> str:
    """Append ``-suffix`` to the repository name of an image reference.

    A version tag (``name:version``) and a digest (``@sha256:...``) are kept
    in place, so ``repo/img:v1`` becomes ``repo/img-<suffix>:v1``.
    """
    reference, at_sign, digest = tag.partition("@")
    repository, colon, version = reference.rpartition(":")
    # A colon only introduces a version when it sits in the final path
    # component; an earlier colon belongs to a registry "host:port".
    if colon and "/" not in version:
        renamed = f"{repository}-{suffix}:{version}"
    else:
        renamed = f"{reference}-{suffix}"
    return renamed + at_sign + digest
def merge_repo_overlays(existing: list[RepoOverlay], incoming: Iterable[RepoOverlay]) -> list[RepoOverlay]:
    """Return ``existing`` plus every overlay from ``incoming`` not already present.

    Two overlays count as the same when their ``repo``, ``url``, ``ref`` and
    ``target`` all match. ``existing`` is copied rather than modified, and the
    relative order of first occurrences is preserved.
    """

    def identity(overlay: RepoOverlay) -> tuple[str, str, str, str]:
        return (overlay.repo, overlay.url, overlay.ref, overlay.target)

    merged = list(existing)
    known = set(map(identity, merged))
    for overlay in incoming:
        fingerprint = identity(overlay)
        if fingerprint in known:
            continue
        known.add(fingerprint)
        merged.append(overlay)
    return merged
+ imports.update(discover_external_imports(step_infos[step_id].step_py)) + group.candidate_imports = imports + if execute: + missing = probe_step_modules( + group.base_image, + [step_infos[step_id].module for step_id in group.steps], + required_imports=imports, + locked_versions=locked_versions, + pip_no_deps=group.pip_no_deps, + platform=group.platform, + ) + else: + missing = probe_missing_imports(group.base_image, sorted(imports), execute=False, platform=group.platform) + group.missing_imports = sorted(set(missing)) + group.missing_core_imports = sorted(name for name in missing if name.split(".", 1)[0] in CORE_IMPORTS) + installable = sorted(name for name in group.missing_imports if name not in group.missing_core_imports) + group.requirements = sorted(requirement_for_import(name, locked_versions) for name in installable) + + +def discover_external_imports(start: Path) -> set[str]: + external: set[str] = set() + try: + tree = ast.parse(start.read_text(encoding="utf-8")) + except SyntaxError: + return external + for node in ast.walk(tree): + imported: list[str] = [] + if isinstance(node, ast.Import): + imported = [alias.name for alias in node.names] + elif isinstance(node, ast.ImportFrom) and not node.level and node.module: + imported = [node.module] + for name in imported: + root = name.split(".", 1)[0] + if root in LOCAL_PREFIXES or is_stdlib(root): + continue + external.add(root) + return external + + +def is_stdlib(root: str) -> bool: + if root in sys.builtin_module_names: + return True + stdlib_names = getattr(sys, "stdlib_module_names", set()) + if root in stdlib_names: + return True + return False + + +def probe_missing_imports(image: str, imports: list[str], *, execute: bool, platform: str | None = None) -> list[str]: + if not imports: + return [] + code = ( + "import importlib.util,json;" + f"mods={imports!r};" + "missing=[m for m in mods if importlib.util.find_spec(m) is None];" + "print(json.dumps(missing))" + ) + cmd = ["docker", "run", "--rm", 
def probe_step_modules(
    image: str,
    modules: list[str],
    *,
    required_imports: Iterable[str],
    locked_versions: Mapping[str, str],
    pip_no_deps: bool,
    platform: str | None = None,
) -> list[str]:
    """Import selected step modules in the execution image and discover missing imports.

    The loop installs only the packages it has already identified, in an
    ephemeral container, so the final requirements file stays based on actual
    import failures rather than broad static guesses.

    Args:
        image: Docker image to probe; it is pulled first if not available locally.
        modules: Dotted module names to import inside the container.
        required_imports: Extra import names to import before ``modules``.
        locked_versions: Mapping passed to ``requirement_for_import`` to build
            the requirement string for each missing import.
        pip_no_deps: When true, ``--no-deps`` is added to the in-container
            ``pip install``.
        platform: Optional ``--platform`` value for ``docker run``; also
            selects the pip cache volume name.

    Returns:
        Root names of the imports that failed, in the order they were found.
        The list is returned as soon as all imports succeed, a missing import
        is in ``CORE_IMPORTS``, or the same requirement would be installed
        a second time.

    Raises:
        SystemExit: With the container's exit code when the failure output has
            no recognizable "No module named" error, or with a message when
            the probe has not converged after 20 attempts.
    """

    ensure_image(image, platform=platform)
    missing: list[str] = []
    requirements: list[str] = []
    imports = sorted(set(required_imports))
    # One Python one-liner that imports the required names, then the step modules.
    import_code = "import importlib;"
    import_code += "".join(f"importlib.import_module({module!r});" for module in imports)
    import_code += "".join(f"importlib.import_module({module!r});" for module in modules)
    # Each failed attempt adds at most one requirement; cap the number of retries.
    for _ in range(20):
        install = ""
        if requirements:
            no_deps = "--no-deps " if pip_no_deps else ""
            install = "python -m pip install " + no_deps
            install += " ".join(shlex_quote(req) for req in requirements)
            # Keep pip quiet on success; on failure dump its log and stop
            # before the import check runs.
            install += (
                " >/tmp/nemotron-airgap-pip.log 2>&1 "
                "|| { echo '[airgap-pip] failed:'; cat /tmp/nemotron-airgap-pip.log; exit 1; } && "
            )
        cmd = [
            "docker",
            "run",
            "--rm",
            "--pull",
            "never",
            "--mount",
            f"type=volume,source={pip_cache_volume(platform)},target=/root/.cache/pip",
            "-v",
            f"{REPO_ROOT}:/workspace/Nemotron:ro",
            "-w",
            "/workspace/Nemotron",
            "-e",
            "PYTHONPATH=/workspace/Nemotron/src",
        ]
        if platform:
            cmd.extend(["--platform", platform])
        cmd.extend([image, "bash", "-lc", install + "python -c " + shlex_quote(import_code)])
        result = subprocess.run(cmd, check=False, capture_output=True, text=True, cwd=REPO_ROOT)
        if result.returncode == 0:
            return missing
        text = result.stderr + "\n" + result.stdout
        # Only the first reported missing module is handled per attempt.
        match = re.search(r"(?:ModuleNotFoundError|ImportError):\s+No module named ['\"]([^'\"]+)['\"]", text)
        if not match:
            # Any other failure (including a failed pip install) is fatal.
            print(text, file=sys.stderr)
            raise SystemExit(result.returncode)
        import_name = match.group(1).split(".", 1)[0]
        if import_name not in missing:
            missing.append(import_name)
        if import_name in CORE_IMPORTS:
            print(f"[probe] base image is missing core import {import_name!r}; choose a compatible execution image")
            return missing
        requirement = requirement_for_import(import_name, locked_versions)
        if requirement in requirements:
            # Already installed once and still not importable: stop retrying.
            return missing
        requirements.append(requirement)
    raise SystemExit(f"import probe did not converge for {image}")
def build_launcher_image(launcher_image: Mapping[str, Any], *, execute: bool) -> int:
    """Build (or, in plan mode, print the command for) the launcher image.

    ``tag`` and ``base_image`` fall back to built-in defaults when missing or
    empty in ``launcher_image``. When ``execute`` is true the base image is
    ensured to be available before the build runs. Returns the status from
    ``run_or_print``.
    """
    tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest")
    base_image = str(launcher_image.get("base_image") or "python:3.12-slim")
    platform = launcher_image_platform(launcher_image)
    platform_args = ["--platform", platform] if platform else []
    cmd = [
        "docker",
        "build",
        *platform_args,
        "-f",
        str(AIRGAP_DIR / "Dockerfile.launcher"),
        "--build-arg",
        f"BASE_IMAGE={base_image}",
        "--build-arg",
        f"UV_VERSION={UV_VERSION}",
        "-t",
        tag,
        ".",
    ]
    if execute:
        ensure_image(base_image, platform=platform)
    return run_or_print(cmd, execute)
def prepare_repo_overlays(group: ExecutionGroup, *, repos_root: Path, execute: bool) -> None:
    """Clone or refresh every repo overlay of ``group`` under ``repos_root``.

    ``repos_root`` is created along with a ``.keep`` marker file, even when
    the group has no overlays. Each overlay is cloned if its directory is
    absent, otherwise fetched, and then checked out at ``overlay.ref``. In
    plan mode (``execute`` false) the git commands are only printed.

    Raises:
        SystemExit: With the failing command's exit code when a git command
            fails. Continuing would build the image from a missing or stale
            checkout.
    """
    repos_root.mkdir(parents=True, exist_ok=True)
    (repos_root / ".keep").touch()
    for overlay in group.repo_overlays:
        dest = repos_root / repo_overlay_dir_name(overlay)
        if dest.exists():
            sync = ["git", "-C", str(dest), "fetch", "--all", "--tags", "--force", "--prune"]
        else:
            sync = ["git", "clone", overlay.url, str(dest)]
        checkout = ["git", "-C", str(dest), "checkout", overlay.ref]
        for cmd in (sync, checkout):
            status = run_or_print(cmd, execute)
            if status:
                raise SystemExit(status)
def platform_matches(cached: str | None, requested: str | None) -> bool:
    """Report whether a cached image platform satisfies the requested one.

    A missing ``cached`` value never matches. An empty or ``None`` request
    accepts any cached platform. Otherwise the cached platform must equal the
    request or extend it with a further ``/``-separated component, as in
    ``linux/arm64/v8`` for a request of ``linux/arm64``.
    """
    return cached is not None and (
        not requested or cached == requested or cached.startswith(f"{requested}/")
    )
import shlex + + return shlex.quote(str(value)) + + +def collect_mounts(infos: Iterable[StepInfo]) -> list[Any]: + out: list[Any] = [] + for info in infos: + out.extend(info.mounts) + return out + + +def collect_repo_overlays(infos: Iterable[StepInfo]) -> list[RepoOverlay]: + out: list[RepoOverlay] = [] + for info in infos: + out = merge_repo_overlays(out, info.repo_overlays) + return out + + +def repo_overlay_manifest(item: RepoOverlay) -> dict[str, str]: + return { + "repo": item.repo, + "url": item.url, + "ref": item.ref, + "target": item.target, + } + + +def repo_overlay_build_manifest(item: RepoOverlay) -> dict[str, str]: + data = repo_overlay_manifest(item) + data["source"] = repo_overlay_dir_name(item) + return data + + +def repo_overlay_dir_name(item: RepoOverlay) -> str: + return f"{sanitize(item.repo)}-{short_hash(repo_overlay_manifest(item))}" + + +def step_to_manifest(info: StepInfo) -> dict[str, Any]: + return { + "target": info.target.spec, + "step_py": str(info.step_py.relative_to(REPO_ROOT)), + "step_toml": str(info.step_toml.relative_to(REPO_ROOT)), + "config": str(info.config_path.relative_to(REPO_ROOT)) if info.config_path else None, + "module": info.module, + } + + +def execution_group_manifest(group: ExecutionGroup) -> dict[str, Any]: + return { + "name": group.name, + "image_names": sorted(group.image_names), + "base_image": group.base_image, + "platform": group.platform, + "tag": group.tag, + "selected_image": group.selected_image or group.tag, + "tar": str(group.tar), + "steps": group.steps, + "pip_no_deps": group.pip_no_deps, + "candidate_imports": sorted(group.candidate_imports), + "missing_imports": group.missing_imports, + "missing_core_imports": group.missing_core_imports, + "requirements": group.requirements, + "requirements_path": str(group.requirements_path) if group.requirements_path else None, + "repo_overlays": [repo_overlay_manifest(item) for item in group.repo_overlays], + "repo_overlays_path": str(group.repo_overlays_path) if 
group.repo_overlays_path else None, + } + + +def step_execution_image_manifest(groups: Iterable[ExecutionGroup]) -> dict[str, str]: + out: dict[str, str] = {} + for group in groups: + image = group.selected_image or group.tag + for step_id in group.steps: + out[step_id] = image + return out + + +def launcher_image_manifest(launcher_image: Mapping[str, Any]) -> dict[str, Any]: + return { + "base_image": launcher_image.get("base_image") or "python:3.12-slim", + "platform": launcher_image.get("platform"), + "tag": launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest", + "tar": launcher_image.get("tar") or "launcher-image.tar", + } + + +def sanitize(value: str) -> str: + return re.sub(r"[^a-zA-Z0-9_.-]+", "-", value).strip("-").lower() or "image" + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/customize/steps/prep/index.md b/docs/customize/steps/data_prep/index.md similarity index 51% rename from docs/customize/steps/prep/index.md rename to docs/customize/steps/data_prep/index.md index e76dc0e07..08ec89553 100644 --- a/docs/customize/steps/prep/index.md +++ b/docs/customize/steps/data_prep/index.md @@ -1,6 +1,6 @@ # Data Preparation -```{include} ../../../../src/nemotron/steps/prep/guide.md +```{include} ../../../../src/nemotron/steps/data_prep/guide.md ``` ```{toctree} diff --git a/docs/customize/steps/data_prep/sft-packing.md b/docs/customize/steps/data_prep/sft-packing.md new file mode 100644 index 000000000..552071784 --- /dev/null +++ b/docs/customize/steps/data_prep/sft-packing.md @@ -0,0 +1,23 @@ +# SFT Data Packing + +```{step-toml} src/nemotron/steps/data_prep/sft_packing/step.toml +``` + +## Reference Implementation + +```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/step.py +:language: python +:caption: step.py +``` + +## Starter Configs + +```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/config/default.yaml +:language: yaml +:caption: config/default.yaml +``` + 
+```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml +:language: yaml +:caption: config/tiny.yaml +``` diff --git a/docs/customize/steps/index.md b/docs/customize/steps/index.md index 30a9078e3..8c9fe2da7 100644 --- a/docs/customize/steps/index.md +++ b/docs/customize/steps/index.md @@ -10,7 +10,7 @@ types hardware curate/index translate/index -prep/index +data_prep/index sft/index eval/index convert/index diff --git a/docs/customize/steps/prep/sft-packing.md b/docs/customize/steps/prep/sft-packing.md deleted file mode 100644 index b375f4686..000000000 --- a/docs/customize/steps/prep/sft-packing.md +++ /dev/null @@ -1,23 +0,0 @@ -# SFT Data Packing - -```{step-toml} src/nemotron/steps/prep/sft_packing/step.toml -``` - -## Reference Implementation - -```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/step.py -:language: python -:caption: step.py -``` - -## Starter Configs - -```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/config/default.yaml -:language: yaml -:caption: config/default.yaml -``` - -```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/config/tiny.yaml -:language: yaml -:caption: config/tiny.yaml -``` diff --git a/docs/customize/steps/sft/megatron-bridge.md b/docs/customize/steps/sft/megatron-bridge.md index d35822ffc..784f5f67d 100644 --- a/docs/customize/steps/sft/megatron-bridge.md +++ b/docs/customize/steps/sft/megatron-bridge.md @@ -12,9 +12,9 @@ ## Starter Configs -```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/nano3.yaml +```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/default.yaml :language: yaml -:caption: config/nano3.yaml +:caption: config/default.yaml ``` ```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml diff --git a/docs/customize/steps/translate/index.md b/docs/customize/steps/translate/index.md index aaeec3de3..3a6f7a82a 100644 --- 
a/docs/customize/steps/translate/index.md +++ b/docs/customize/steps/translate/index.md @@ -3,5 +3,5 @@ ```{toctree} :maxdepth: 1 -translation +nemo-curator ``` diff --git a/docs/customize/steps/translate/translation.md b/docs/customize/steps/translate/nemo-curator.md similarity index 54% rename from docs/customize/steps/translate/translation.md rename to docs/customize/steps/translate/nemo-curator.md index 5c1a8cded..87e4317ad 100644 --- a/docs/customize/steps/translate/translation.md +++ b/docs/customize/steps/translate/nemo-curator.md @@ -5,7 +5,7 @@ It should stay a thin wrapper around Curator; do not generate custom chunking or pandas processing unless a single huge input file needs a one-off preprocessing stage. -```{step-toml} src/nemotron/steps/translate/translation/step.toml +```{step-toml} src/nemotron/steps/translate/nemo_curator/step.toml ``` ## Agent Checklist @@ -22,29 +22,49 @@ stage. ## CLI -Run the step directly: +Install the Curator-backed translation dependencies before running the step: ```bash -nemotron steps translation \ +uv sync --extra translate +``` + +Run the step through the generic step dispatcher with bare ``key=value`` +overrides appended at the end of the command: + +```bash +uv run --extra translate nemotron steps run translate/nemo_curator \ input_path=/path/to/source.jsonl \ output_dir=/path/to/translated \ source_language=en \ target_language=hi ``` -Use `-c` or `--config` to pass a config file or config name from the step's -`config/` directory. The CLI currently supports local execution only. +Use `-c` or `--config` to pass a config name from the step's `config/` +directory or a path to a YAML file. Trailing tokens that contain ``=`` and do +not begin with ``-`` are routed into the Hydra-style dotlist override layer. 
+ +For batch executors such as Lepton or Slurm, add ``--batch <profile>``: + +```bash +uv run nemotron steps run translate/nemo_curator \ + -c default \ + --batch lepton_translate \ + input_path=/mnt/lustre-shared/data/source.jsonl \ + output_dir=/mnt/lustre-shared/output/translated \ + source_language=en \ + target_language=hi +``` ## Reference Implementation -```{literalinclude} ../../../../src/nemotron/steps/translate/translation/step.py +```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_curator/step.py :language: python :caption: step.py ``` ## Starter Config -```{literalinclude} ../../../../src/nemotron/steps/translate/translation/config/default.yaml +```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_curator/config/default.yaml :language: yaml :caption: config/default.yaml ``` diff --git a/docs/customize/steps/translate/nemo-skills.md b/docs/customize/steps/translate/nemo-skills.md deleted file mode 100644 index a78f55d7f..000000000 --- a/docs/customize/steps/translate/nemo-skills.md +++ /dev/null @@ -1,18 +0,0 @@ -# Translation + FAITH Scoring (NeMo Skills) - -```{step-toml} src/nemotron/steps/translate/nemo_skills/step.toml -``` - -## Reference Implementation - -```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_skills/step.py -:language: python -:caption: step.py -``` - -## Starter Config - -```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_skills/config/default.yaml -:language: yaml -:caption: config/default.yaml -``` diff --git a/pyproject.toml b/pyproject.toml index bb45a94b0..d5895a869 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "nemotron" version = "0.1.0" description = "Reproducible training recipes for NVIDIA Nemotron model family - transparent pipelines for data preparation, training, and evaluation across all stages" -requires-python = ">=3.10" +requires-python = ">=3.10,<3.14" license = {text = "MIT"} authors = [ {name = "Nemotron Contributors"} @@ -33,7 +33,7 @@
dependencies = [ "numpy>=1.24.0", "pyarrow>=14.0.0", "xxhash>=3.4.0", - "transformers>=4.36.0", + "transformers>=4.57.6,<5.0", "huggingface_hub>=0.20.0", "datasets>=2.14.0", # Required for ray.data.from_huggingface "pyyaml>=6.0", @@ -64,15 +64,37 @@ audio = [ # `uv run --no-project` (they declare their own PEP 723 inline deps). data-sdg = ["data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'"] byob = [ - # BYOB uses Curator's current translation and semantic dedup stack, which is Python 3.11+. + # BYOB CPU/runtime dependencies. GPU semantic-dedup/outlier dependencies live in `byob-gpu`. + "cosmos-xenna>=0.2,<0.3; python_version>='3.11' and python_version<'3.14'", "data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'", - "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", + "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", + "datasets>=2.14.0; python_version>='3.11'", + "numpy>=2.2,<3; python_version>='3.11'", "pandas>=2.1.0; python_version>='3.11'", - "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'", - "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'", + "pyarrow>=14.0.0; python_version>='3.11'", + "pyyaml>=6.0; python_version>='3.11'", + "pydantic>=2.0.0; python_version>='3.11'", + "requests>=2.0.0; python_version>='3.11'", + "tqdm; python_version>='3.11'", + "urllib3>=2.7.0,<3; python_version>='3.11'", + "obstore>=0.8,<0.9; python_version>='3.11'", + "portpicker>=1.6,<2; python_version>='3.11'", + "pulp>=3.3,<4; python_version>='3.11'", + "attrs>=25.4,<26; python_version>='3.11'", + "cattrs>=25.3,<26; python_version>='3.11'", + "jinja2>=3.1,<4; python_version>='3.11'", + "loguru>=0.7,<1; python_version>='3.11'", + "tabulate>=0.9,<1; python_version>='3.11'", "sacrebleu>=2.6.0,<3.0.0; python_version>='3.11'", "iso639-lang>=2.6.0,<3.0.0; 
python_version>='3.11'", "bcp47>=0.1.0; python_version>='3.11' and python_version<'3.14'", +] +byob-gpu = [ + "torch>=2.10,<2.11; python_version>='3.11'", + "transformers>=4.57.6,<5.0; python_version>='3.11'", + "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'", + "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'", + "cupy-cuda12x>=14.0,<15; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "cuda-bindings>=12.9,<13; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "cuda-python>=12.9,<13; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "cudf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", @@ -82,8 +104,29 @@ byob = [ "raft-dask-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "rapidsmpf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", ] +translate = [ + "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", + "pyarrow>=14.0.0; python_version>='3.11'", + "pyyaml>=6.0; python_version>='3.11'", + "obstore>=0.8,<0.9; python_version>='3.11'", + "portpicker>=1.6,<2; python_version>='3.11'", + "pulp>=3.3,<4; python_version>='3.11'", + "attrs>=25.4,<26; python_version>='3.11'", + "cattrs>=25.3,<26; python_version>='3.11'", + "jinja2>=3.1,<4; python_version>='3.11'", + "loguru>=0.7,<1; python_version>='3.11'", + "tabulate>=0.9,<1; python_version>='3.11'", + "sacrebleu>=2.6.0,<3.0.0; python_version>='3.11'", + "bcp47>=0.1.0; python_version>='3.11' and python_version<'3.14'", +] +evaluator = ["nemo-evaluator-launcher>=0.1.0"] +curate = [ + "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", + "huggingface_hub>=0.20.0; python_version>='3.11'", + 
"pyyaml>=6.0; python_version>='3.11'", +] dev = [ - "pytest>=7.0.0", + "pytest>=9.0.3", "pytest-cov>=4.0.0", "mypy>=1.0.0", "ruff>=0.1.0", @@ -97,7 +140,7 @@ all = [ "webdataset>=0.2.86", "imageio-ffmpeg>=0.5.1", "data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'", - "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", + "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'", "pandas>=2.1.0; python_version>='3.11'", "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'", "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'", @@ -112,6 +155,7 @@ all = [ "pylibraft-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "raft-dask-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", "rapidsmpf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'", + "nemo-evaluator-launcher>=0.1.0", ] # Note: megatron-bridge is required for training but not listed as a dependency @@ -173,18 +217,116 @@ package = true constraint-dependencies = [ # Curator main currently keeps these constraints in its own uv config, but # they are not published transitively through package metadata. - "transformers>=4.56.0,<5.0", + "transformers>=4.57.6,<5.0", + "python-multipart>=0.0.29", + "cryptography>=48.0.0", + "gitpython>=3.1.50", + "pytest>=9.0.3", ] override-dependencies = [ # data-designer-engine==0.5.5 declares huggingface-hub>=1.0.1, while # Curator's Transformers-compatible stack requires the pre-1.0 API. "huggingface-hub>=0.34,<1.0", "torch==2.10.0", + # torchx==0.7.0 still caps urllib3<1.27 through nemo-run. Force the + # patched urllib3 line while we validate torchx/nemo-run compatibility. 
+ "urllib3>=2.7.0,<3", +] + +[tool.nemotron.runtime.byob] +extras = ["byob"] +venv-name = "byob" +extra-index-urls = ["https://pypi.nvidia.com"] +omit-packages = ["nemo-curator"] +required-imports = [ + "bcp47", + "cattrs", + "data_designer", + "datasets", + "jinja2", + "loguru", + "nemo_curator", + "numpy", + "obstore", + "pandas", + "portpicker", + "pulp", + "pyarrow", + "pydantic", + "requests", + "sacrebleu", + "tabulate", + "tqdm", + "yaml", +] + +[tool.nemotron.runtime.byob-gpu] +extras = ["byob", "byob-gpu"] +venv-name = "byob-gpu" +extra-index-urls = ["https://pypi.nvidia.com"] +torch-backend = "cu128" +omit-packages = ["nemo-curator"] +required-imports = [ + "bcp47", + "cattrs", + "data_designer", + "datasets", + "jinja2", + "loguru", + "nemo_curator", + "numpy", + "obstore", + "pandas", + "portpicker", + "pulp", + "pyarrow", + "pydantic", + "requests", + "sacrebleu", + "sentence_transformers", + "sklearn", + "tabulate", + "tqdm", + "yaml", +] +# RAPIDS imports can initialize CUDA libraries. Checking import specs verifies +# wheels are present without loading CUDA libraries during bootstrap readiness. 
+spec-only-imports = ["cudf", "cuml", "cupy"] + +[tool.nemotron.runtime.translate] +extras = ["translate"] +venv-name = "translate" +extra-index-urls = ["https://pypi.nvidia.com"] +omit-packages = ["nemo-curator"] +required-imports = [ + "bcp47", + "cattrs", + "jinja2", + "loguru", + "nemo_curator", + "obstore", + "portpicker", + "pulp", + "pyarrow", + "sacrebleu", + "tabulate", + "yaml", +] + +[tool.nemotron.runtime.curate] +extras = ["curate"] +venv-name = "curate" +extra-index-urls = ["https://pypi.nvidia.com"] +omit-packages = ["nemo-curator"] +required-imports = [ + "huggingface_hub", + "nemo_curator", + "yaml", ] [dependency-groups] dev = [ - "pytest>=9.0.2", + "pytest>=9.0.3", ] run = [ "nemo-run @ git+https://github.com/NVIDIA-NeMo/Run.git@main", diff --git a/skills/INDEX.md b/skills/INDEX.md index cbbab7320..7b1675a11 100644 --- a/skills/INDEX.md +++ b/skills/INDEX.md @@ -20,12 +20,13 @@ its own `SKILL.md` (frontmatter + body) and lives in a sibling directory. skills/ ← workflow & reference skills (this directory) └── nemotron-customize/ ← e.g. 
pipeline-builder skill ├── SKILL.md ← agent entry point (Orient/Plan/Act/Verify) - ├── act/ ← codegen rules loaded during Act phase - │ ├── PROJECT.md ← project-scaffold rules (R1–R10) - │ └── STAGE.md ← per-stage rules (R1–R5, dry-run, W&B) - └── context/ ← authored library API extracts for codegen - ├── index.toml ← (step_id, intent) → pack file - └── README.md ← provenance + nv-base notes + ├── references/ + │ ├── act/ ← codegen rules loaded during Act phase + │ │ ├── PROJECT.md ← project-scaffold rules (R1–R10) + │ │ └── STAGE.md ← per-stage rules (R1–R5, dry-run, W&B) + │ └── context/ ← authored library API extracts for codegen + │ ├── index.toml ← (step_id, intent) → pack file + │ └── README.md ← provenance notes src/nemotron/steps/ ← step library (the catalog skills route into) ├── SKILL.md ← per-category routing @@ -37,7 +38,7 @@ src/nemotron/steps/ ← step library (the catalog skills route in ├── step.toml ← machine contract (consumes/produces/params/strategies/errors) ├── SKILL.md ← agent prose: when/why/gotchas (per-step) ├── step.py ← runspec + entry point - └── config/ ← default.yaml + tiny.yaml + └── config/ ← one or more named configs ``` **Rule of thumb:** @@ -48,12 +49,6 @@ src/nemotron/steps/ ← step library (the catalog skills route in ## Validation -This directory is validated by [nv-base](https://gitlab-master.nvidia.com/ai_tools/nvcarps_team/nv-base): - -```bash -nv-base validate skills/ --type skill --no-llm -r cli json -o reports/nv-base -c -``` - Every `SKILL.md` requires a YAML frontmatter block: ```markdown @@ -63,9 +58,8 @@ description: --- ``` -The 17 files under `nemotron-customize/context/*.txt` are extracted upstream -documentation from the Nemotron-stack libraries (Megatron-Bridge, AutoModel, -Curator, NeMo-RL, Speaker, Evaluator, ModelOpt, Data Designer). They contain -code snippets that nv-base flags for `Env Variable Harvesting`, `Credential -Access`, etc. 
— these are **documentation false positives**, not executable -code paths in this repo. +The files under `nemotron-customize/references/context/*.txt` are short +curated context packs for the Nemotron-stack libraries (Megatron-Bridge, +AutoModel, Curator, NeMo-RL, Evaluator, ModelOpt, Data Designer). They are +read-only reference material for grounding agent changes in the real library +APIs, not runtime code paths. diff --git a/skills/nemotron-customize/.claude-plugin/plugin.json b/skills/nemotron-customize/.claude-plugin/plugin.json new file mode 100644 index 000000000..a7e4aa56b --- /dev/null +++ b/skills/nemotron-customize/.claude-plugin/plugin.json @@ -0,0 +1,10 @@ +{ + "name": "nemotron-customize", + "description": "Compose runnable NVIDIA Nemotron model-customization pipelines from existing repo steps.", + "version": "0.1.0", + "author": { + "name": "NVIDIA Nemotron Team" + }, + "homepage": "https://github.com/NVIDIA/Nemotron", + "skills": ["./"] +} diff --git a/skills/nemotron-customize/SKILL.md b/skills/nemotron-customize/SKILL.md index 94090e0e3..d0e644e35 100644 --- a/skills/nemotron-customize/SKILL.md +++ b/skills/nemotron-customize/SKILL.md @@ -1,33 +1,65 @@ --- name: nemotron-customize -description: Compose runnable training pipelines from steps under src/nemotron/steps/. Plans a stage DAG, validates artifact wiring against types.toml, fires patterns, then generates a forkable Python project. Use when the user wants to fine-tune, pretrain, align, evaluate, or optimize a Nemotron-stack model end-to-end. +description: Use when building runnable Nemotron model-customization pipelines from existing repo steps and artifact contracts. +version: 0.1.0 +metadata: + author: NVIDIA Nemotron Team + tags: + - nemotron + - customization + - training + - pipelines --- # nemotron-customize -Invocation: `/nemotron-customize`. +## Purpose -You compose **steps** from [src/nemotron/steps/](../../src/nemotron/steps/) -into a runnable Python project the user owns. 
**The step library is the -source of truth.** This skill orchestrates — it does not duplicate per-step -knowledge. +Use this skill to turn a model-customization request into a repo-native Nemotron step pipeline. It plans the step DAG, validates artifact wiring, and creates only the YAML configs needed to run existing steps. -When you need to know what a step does, read its `step.toml` and `SKILL.md`. -When you need to know whether a chain is sound, read the patterns it cites. -When you need to write code for a stage, read `step.py` + the runner + -(if mapped in [context/index.toml](context/index.toml)) the context pack. +Use it only for inspecting, configuring, validating, running, or submitting +existing Nemotron steps or multi-step training/customization pipelines. If the +request is a frontend, dashboard, visualization, generic ML-advice, +billing/access, or unrelated coding task, stop with a short scope note and do +not inspect the step catalog or edit files in that turn. -## Tone +## Requirements -Concise. Technical. No fluff. +- A checkout of this Nemotron repo with `src/nemotron/steps/` available. +- **Invoke from the repo root.** All file paths in this document are repo-root-relative (e.g. `src/nemotron/steps/STEPS.md`, `skills/nemotron-customize/references/act/STAGE.md`). Resolve them against the user's current working directory, which must be the Nemotron checkout root. +- User-provided model, data, hardware, backend, and output constraints before writing configs. +- Backend credentials only when the selected step requires them, such as translation or W&B-enabled training. -- Status updates: ≤2 lines. -- Plan commentary: one sentence per stage, max. -- Decision explanations: tables over paragraphs. -- Never start with "Great", "Sure", "Certainly", "Of course". -- No emojis unless the user uses them first. +## Limitations ---- +- This skill does not invent new catalog steps when an existing step can satisfy the request. 
+- New Python or shell code is allowed only in Explorer mode after the repo capability gap is explicit. +- Post-training deployment-only requests are out of scope unless they are part of a model-customization pipeline. + +Invocation: `/nemotron-customize`. + +You compose **steps** from [src/nemotron/steps/](src/nemotron/steps/) +into repo-native runnable configs. **The current codebase is the source of +truth.** This skill orchestrates — it does not duplicate per-step knowledge. + +Priority order: + +1. Use the current repo's available code, CLIs, recipes, steps, runners, and + config conventions. +2. Create only new YAML config files needed to serve the user's request. +3. Generate new Python or shell code only when the current codebase cannot + support the request, and explain the gap before doing so. + +When you need to know what a step does, read its `step.toml` and `SKILL.md`. +When you need to know whether a chain is sound, read the patterns it cites. +When you need to configure a stage, read `step.py` + the runner + existing +configs to learn the supported YAML shape. Read context packs only if new code +is unavoidable. + +For a command request, the fast path is: verify repo root, run or read the step +catalog, read the selected `step.toml`, verify the requested config exists, +read the active env TOML for any remote profile, then emit the complete command. +Do not guess `--batch` profiles from examples or naming conventions. ## How information is split (and where to find it) @@ -36,19 +68,73 @@ Concise. Technical. No fluff. | What does step X consume / produce / parameterize? | `src/nemotron/steps///step.toml` | | When/why pick step X over its siblings? | `src/nemotron/steps///SKILL.md` | | Which step in category C should I pick? | `src/nemotron/steps//SKILL.md` | -| What runner code does step X use? | `src/nemotron/steps///step.py` → [_runners/](../../src/nemotron/steps/_runners/) | -| Cross-step constraint (tokenizer lock, eval bookends, ...) 
| `src/nemotron/steps/patterns/<pattern>.md` | -| Artifact compatibility / `is_a` / `convert_to` | [src/nemotron/steps/types.toml](../../src/nemotron/steps/types.toml) | -| GPU memory / parallelism heuristics | [src/nemotron/steps/hardware.md](../../src/nemotron/steps/hardware.md) | -| Library API extracts for code generation | [context/index.toml](context/index.toml) → `context/<pack>.txt` | -| Project scaffold rules (CLI, pyproject, README, deploy) | [act/PROJECT.md](act/PROJECT.md) | -| Per-stage code rules (R1–R5, dry-run, W&B) | [act/STAGE.md](act/STAGE.md) | +| What runner code does step X use? | `src/nemotron/steps/<category>/<name>/step.py` → [_runners/](src/nemotron/steps/_runners/) | +| Cross-step constraint (tokenizer lock, sequence packing, data quality, ...) | `src/nemotron/steps/patterns/<pattern>.md` | +| Artifact compatibility / `is_a` hierarchy | [src/nemotron/steps/types.toml](src/nemotron/steps/types.toml) | +| GPU memory / parallelism heuristics | [src/nemotron/steps/hardware.md](src/nemotron/steps/hardware.md) | +| Library API extracts for exceptional code generation | [references/context/index.toml](skills/nemotron-customize/references/context/index.toml) → `references/context/<pack>.txt` | +| Project scaffold rules, only when repo code cannot support the request | [references/act/PROJECT.md](skills/nemotron-customize/references/act/PROJECT.md) | +| Per-stage code rules, only when repo code cannot support the request | [references/act/STAGE.md](skills/nemotron-customize/references/act/STAGE.md) | If two sources say the same thing, the **deeper, more specific** one wins (`step.toml` > category `SKILL.md` > this file). --- +## Instructions + +Use this skill when the user asks for an end-to-end Nemotron-stack pipeline: +fine-tuning, continued pretraining, alignment training, data curation, +translation for training data, or other data preprocessing for model training. +Follow the workflow below in order: + +1.
**Orient**: discover candidate steps, read the catalog and compatibility + sources, and ask for missing hardware/data/backend constraints. +2. **Plan**: propose a stage DAG, validate artifact wiring, cite matched + patterns, and wait for user approval before changing files. +3. **Act**: create the minimal YAML configs for the selected repo steps. + Generate code only if no current repo path can satisfy the request. +4. **Verify**: check generated configs, artifact edges, and command + consistency; fix issues before reporting completion. + +Do not treat this skill as general ML advice. The step library under +[src/nemotron/steps/](src/nemotron/steps/) is the source of truth. + +For single-step command questions, use this shorter flow instead of the full +pipeline workflow: + +1. Confirm the repo root has `pyproject.toml` and `src/nemotron/steps/`. +2. Run `uv run nemotron steps list --json` when available; otherwise read + [STEPS.md](src/nemotron/steps/STEPS.md). +3. Read the selected step's `step.toml` and the requested checked-in config. +4. For remote execution, read `NEMOTRON_ENV_FILE` or a repo-root `env*.toml` + and choose an actual section name whose profile matches the step. +5. Return the command first, followed by only the rationale needed to explain + config/profile choices. + +For translation-only command requests, also read +[src/nemotron/steps/translate/SKILL.md](src/nemotron/steps/translate/SKILL.md) +and return `Decision`, `Config`, `Run`, `Output`, and `Env`. Do not continue +broad repository exploration once those fields are execution-ready. + +Source tiers for command answers: + +- **Verified**: CLI, manifest, config, env profile, and dry-run all succeeded. +- **Repo-grounded**: manifest, config, and env profile were read, but dry-run + could not be executed. +- **Blocked**: a required repo file or env TOML is missing; name it and stop + before emitting a guessed remote command. 
+ +Canonical commands: + +```bash +uv run nemotron steps run <step_id> -c <config> --dry-run +uv run nemotron steps run <step_id> -c <config> --dry-run --batch <profile> +uv run nemotron steps run <step_id> -c <config> --batch <profile> +``` + +--- + ## Workflow Four phases, in order: **Orient → Plan → Act → Verify.** Never skip Verify. @@ -63,35 +149,27 @@ Goal: enumerate candidate steps and gather the user's constraints in one pass. machine-readable: ```bash -nemotron step list --json # all steps -nemotron step list --json --category sft # by category -nemotron step list --json --consumes training_jsonl # by input type -nemotron step list --json --produces checkpoint_megatron # by output type -nemotron step show <step_id> # full manifest +nemotron steps list --json # all steps +nemotron steps list --json --category sft # by category +nemotron steps list --json --consumes training_jsonl # by input type +nemotron steps list --json --produces checkpoint_megatron # by output type +nemotron steps show <step_id> # full manifest ``` -Implementation: [list_cmd.py](../../src/nemotron/cli/commands/step/list_cmd.py), -[show_cmd.py](../../src/nemotron/cli/commands/step/show_cmd.py), -[run_cmd.py](../../src/nemotron/cli/commands/step/run_cmd.py). - -Per-step JSON schema: `{id, name, category, description, tags, path, -consumes:[{type,required,description}], produces:[...], parameters:[...]}`. - **Step 1.2 — Read these in parallel** (small files, all cheap): -- [src/nemotron/steps/STEPS.md](../../src/nemotron/steps/STEPS.md) — auto-generated catalog (always read first). +- [src/nemotron/steps/STEPS.md](src/nemotron/steps/STEPS.md) — auto-generated catalog (always read first).
+- [src/nemotron/steps/PATTERNS.md](src/nemotron/steps/PATTERNS.md) — auto-generated pattern index. +- [src/nemotron/steps/types.toml](src/nemotron/steps/types.toml) — artifact compatibility graph (`is_a` hierarchy). +- [src/nemotron/steps/hardware.md](src/nemotron/steps/hardware.md) — GPU heuristics if hardware is in scope. **Step 1.3 — For each candidate category, descend one level**: - `src/nemotron/steps//SKILL.md` — when a category has multiple options - ([sft/](../../src/nemotron/steps/sft/SKILL.md), - [pretrain/](../../src/nemotron/steps/pretrain/SKILL.md), - [peft/](../../src/nemotron/steps/peft/SKILL.md), - [rl/nemo_rl/](../../src/nemotron/steps/rl/nemo_rl/SKILL.md), - [optimize/modelopt/](../../src/nemotron/steps/optimize/modelopt/SKILL.md)). + ([sft/](src/nemotron/steps/sft/SKILL.md), + [pretrain/](src/nemotron/steps/pretrain/SKILL.md), + [peft/](src/nemotron/steps/peft/SKILL.md), + [rl/nemo_rl/](src/nemotron/steps/rl/nemo_rl/SKILL.md)). **Step 1.4 — For each candidate step, read its `step.toml`** end-to-end. You're after: `[[consumes]]`, `[[produces]]`, `[[parameters]]`, @@ -109,9 +187,8 @@ Present as a numbered list, replies as numbers or Enter for `[defaults]`: 3. Data size (rough): \_\_\_ examples 4. GPUs: count + type + nodes (e.g. `8x H100, 1 node`) 5. Backend preference: `[nemo-run]` / plain Python -6. Deploy: `[local only]` / Airflow / Kubeflow -7. W&B: `[off]` / on (project name?) -8. Output: `[.//]` / current dir +6. W&B: `[off]` / on (project name?) +7. Output: `[.//]` / current dir **Never assume hardware, data availability, or framework. Ask.** @@ -132,107 +209,68 @@ Goal: produce a markdown plan the user reviews before any code is written. - Strategies fired (the `when:` clauses from `step.toml` that match). - Patterns cited (from `src/nemotron/steps/patterns/`). 
-**Step 2.3 — Run preflight validation.** Each item is a hard check: - -| # | Check | Source of truth | -|---|---|---| -| 1 | Every `consumes.type` matches an upstream `produces.type` (direct or via `is_a`). | [types.toml](../../src/nemotron/steps/types.toml) | -| 2 | If a chain breaks, insert the right converter step. | `convert_to` in [types.toml](../../src/nemotron/steps/types.toml) → [convert/megatron_to_hf](../../src/nemotron/steps/convert/megatron_to_hf/), [convert/hf_to_megatron](../../src/nemotron/steps/convert/hf_to_megatron/), [convert/merge_lora](../../src/nemotron/steps/convert/merge_lora/) | -| 3 | Tokenizer + chat template + seq_length consistent across prep ↔ train ↔ RL ↔ eval. | [patterns/prep-data-is-tokenizer-locked.md](../../src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md), [patterns/sft-sequence-packing.md](../../src/nemotron/steps/patterns/sft-sequence-packing.md) | -| 4 | LoRA outputs are merged before eval/RL. | [patterns/peft-adapter-merge-discipline.md](../../src/nemotron/steps/patterns/peft-adapter-merge-discipline.md) | -| 5 | Eval bookends present (before + after training). | [patterns/eval-before-and-after-training.md](../../src/nemotron/steps/patterns/eval-before-and-after-training.md) | -| 6 | RL warm-starts from SFT; rewards validated before scale. | [patterns/rl-validate-rewards-before-scale.md](../../src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md) | -| 7 | GPU count ≥ chosen model's `min_gpus` (from `[[models]]` block in each `step.toml`). | step.toml + [hardware.md](../../src/nemotron/steps/hardware.md) | -| 8 | Sovereign / customization patterns checked: `cpt-data-blend-scoping`, `sft-data-blending`, `multilingual-tokenizer-check`, `data-quality-before-quantity`, `sdg-pipeline-versioning`, `byob-benchmark-design`, `pretrain-token-budget-before-scale`, `sft-small-dataset-prefer-lora`, `convert-checkpoint-safety`. 
| [patterns/](../../src/nemotron/steps/patterns/) | - -When a check fails: surface it as a `⚠` warning in the plan and propose a +**Step 2.3 — Run preflight validation.** Hard checks: artifact types chain via [types.toml](src/nemotron/steps/types.toml); tokenizer/template/sequence length align across prep and train; RL warm-starts from SFT; GPU count satisfies the selected model; applicable patterns are cited. When a check fails: surface it as a `WARNING:` line in the plan and propose a fix. When the user can't satisfy it (e.g. hardware), propose alternatives in descending preference: smaller model → AutoModel instead of Megatron-Bridge → LoRA instead of full FT. -**Step 2.4 — Plan format:** - -````markdown -# Pipeline Plan: - -## Intent - - -## Stages -```mermaid -graph LR - A[01_curate] -->|filtered_jsonl| B[02_prep] - B -->|packed_parquet| C[03_sft] - C -->|checkpoint_megatron| D[04_eval] -``` - -### 1. / -- Consumes: from -- Produces: -- Key params: <2–3 from step.toml> -- Strategies fired: -- Patterns cited: - - - -## Validation (preflight) -✓ Artifact chain -✓ Tokenizer / template / seq_length consistency -✓ Eval bookends present -✓ GPU count ≥ min_gpus -✓ All applicable patterns acknowledged -⚠ - -## Infrastructure -| Resource | Required by | Notes | -|---|---|---| -| | | | -```` +**Step 2.4 — Plan format.** Include `Intent`, `Stages`, `Validation`, and `Infrastructure`. Use a Mermaid graph for artifact flow, one short stage block per step, and explicit `PASS:` / `WARNING:` validation lines. **Step 2.5 — Present the plan and wait.** Don't proceed to Act until the -user approves or requests changes. +user approves or requests changes. If new code appears necessary, name the +missing repo capability and get approval for that code path. --- ### Phase 3 — Act -Goal: produce a complete, runnable Python project. No placeholders. No TODOs. +Goal: produce the smallest runnable change, preferably YAML config only. No +placeholders. No TODOs. 
-**Step 3.1 — Load codegen rules.** +**Step 3.1 — Prefer the existing repo execution path.** -- Main agent reads [act/PROJECT.md](act/PROJECT.md) (project scaffold rules). -- Each per-stage sub-agent reads [act/STAGE.md](act/STAGE.md) (R1–R5 + - code-quality + dry-run + W&B). +Before creating any code, identify how the existing repo can run each stage: + +- CLI commands under [src/nemotron/cli/](src/nemotron/cli/). +- Step entrypoints in `src/nemotron/steps///step.py`. +- Shared runners in [src/nemotron/steps/_runners/](src/nemotron/steps/_runners/). +- Existing configs under the selected step, recipe, or runner directory. -**Step 3.2 — Main agent generates the scaffold:** +**Step 3.2 — Generate only YAML configs when the repo supports the request.** ``` / -├── pyproject.toml -├── .python-version # "3.12" -├── README.md # with mermaid + stage table -├── env.toml.example -├── / -│ ├── __init__.py -│ ├── __main__.py # `from .cli import app; app()` -│ ├── cli.py # Typer; one cmd per stage + `all` -│ └── stages/ # populated by sub-agents -└── .generated/ - ├── pipeline.toml # canonical stage graph - ├── SKILL.md # invocable as / (with frontmatter) - └── plugin.json # .claude-plugin manifest +├── configs/ +│ └── .yaml # user-specific config for an existing step +└── README.md # optional: only if the user asks for run docs ``` -Naming: `` is kebab-case (skill invocation, DAG name); -`` is snake_case (Python identifier). +Naming: `` is kebab-case. YAML filenames should match approved +stage names. + +Each YAML config must: + +- Match keys read by the existing `step.py` and runner code. +- Adapt existing default/tiny configs instead of inventing a schema. +- Use user-provided paths, model IDs, hardware, backend, and W&B settings. +- Preserve artifact compatibility from the approved plan. 
+ +**Step 3.3 — Only use codegen when YAML cannot satisfy the request.** -**Step 3.3 — For each stage, spawn one sub-agent in parallel:** +If the repo lacks a callable step, runner, CLI, or config surface for the +requested behavior, load codegen rules: + +- Main agent reads [references/act/PROJECT.md](skills/nemotron-customize/references/act/PROJECT.md) (project scaffold rules). +- Each per-stage sub-agent reads [references/act/STAGE.md](skills/nemotron-customize/references/act/STAGE.md) (R1–R5 + + code-quality + dry-run + W&B). + +Then implement the missing stage with the narrowest possible code change: ``` You are implementing stage _ = . Load: - - skills/nemotron-customize/act/STAGE.md + - skills/nemotron-customize/references/act/STAGE.md - # from context/index.toml; OPTIONAL — skip if not mapped - src/nemotron/steps///step.py # primary code shape - src/nemotron/steps/_runners/.py # if step.py imports a shared runner @@ -248,86 +286,59 @@ Deliverables (exactly these): - run.py - __init__.py - config/default.yaml - - config/tiny.yaml + - config/tiny.yaml, or the step's checked-in smoke config name such as config/tiny_chat.yaml for eval/model_eval Report back: files written, knobs exposed, UPSTREAM notes, strategies followed. ``` -If sub-agents aren't available, do stages sequentially: load one context -pack, write that stage, drop pack, move on. +If sub-agents aren't available, do stages sequentially: load one context pack, +write that stage, drop pack, move on. -**Step 3.4 — Step.py + the runner are the reference.** Don't invent library -APIs from memory. Mirror what the in-repo code does: +**Step 3.4 — Step.py + the runner are the reference.** Don't invent YAML keys +or library APIs from memory. Mirror what the in-repo code does: -- [steps/_runners/megatron_bridge.py](../../src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps. 
-- [steps/_runners/automodel.py](../../src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps. -- [steps/_runners/nemo_rl.py](../../src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps. -- [steps/_runners/modelopt.py](../../src/nemotron/steps/_runners/modelopt.py) — used by quantize/prune/distill. +- [steps/_runners/megatron_bridge.py](src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps. +- [steps/_runners/automodel.py](src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps. +- [steps/_runners/nemo_rl.py](src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps. -For steps without a context pack (`sft/megatron_bridge`, `eval/model_eval`, -`curate/nemo_curator`, `translate/nemo_skills`, `convert/*`), the agent -combines: per-step `SKILL.md` + `step.toml [[strategies]]` + `step.py` + the -URLs in `[reference]`. That's enough. +When a step has no context pack, the agent combines: per-step `SKILL.md` + `step.toml [[strategies]]` + `step.py` + the URLs in `[reference]`. That is enough. --- ### Phase 4 — Verify -Goal: every preflight check holds against the *generated files*, not just -the plan. +Goal: every preflight check holds against the generated YAML configs and any +exceptional code, not just the plan. Run through: -- [ ] Every stage script has valid Python syntax (no placeholder functions). -- [ ] Every import references a real module from the step's reference code. -- [ ] Every `config/*.yaml` is valid; keys match what `run.py` reads. -- [ ] `.generated/pipeline.toml` matches the generated `stages/` dirs. +- [ ] Every generated `*.yaml` is valid; keys match the existing step/runner code. - [ ] Artifact wiring is consistent (stage N output type = stage N+1 input type). -- [ ] `pyproject.toml` covers every imported third-party package. -- [ ] `README.md` mermaid matches the actual stages. 
-- [ ] `tiny.yaml` configs use reduced iters, batch sizes, max_steps. -- [ ] Tokenizer + seq_length aligned across prep ↔ train ↔ eval YAMLs. -- [ ] No `${art:...}` references leaked into generated configs (those belong only in [src/nemotron/recipes/](../../src/nemotron/recipes/)). +- [ ] Existing CLI or runner commands can consume the generated configs. +- [ ] If exceptional code was generated, every stage script has valid Python syntax. +- [ ] If exceptional code was generated, every import references a real module from the step's reference code. +- [ ] If a README was generated, its commands match the actual configs. +- [ ] Smoke-test YAML configs use reduced iters, batch sizes, max_steps. +- [ ] Tokenizer + seq_length aligned across prep ↔ train YAMLs. +- [ ] No `${art:...}` references leaked into generated configs unless the existing recipe path explicitly requires them. If verification finds issues, fix them silently. Don't say "I noticed an issue." --- -## Operational nuances (not in patterns/) - -These are generation-time concerns, not ML decision rules. Patterns own ML -rules; this section owns what *this skill specifically* does. - -### `tiny.yaml` is for plumbing, not metrics +## Operational Nuances -Each step ships `config/default.yaml` (production) and `config/tiny.yaml` -(smoke test: handful of iters, micro batch, tiny seqlen). Generated projects -must mirror this and **default the CLI to `default`**. tiny is for verifying -the wiring runs end-to-end on a cheap budget — never for evidence of model -quality. +- Smoke configs such as `tiny.yaml` or eval/model_eval's `tiny_chat.yaml` are for wiring tests, not model-quality evidence. +- If a `step.toml` strategy points to unavailable upstream docs, use its `then:` text and mark the plan for manual review. +- Preserve `${art:...}` only in recipe-backed configs; standalone YAML should use plain paths. +- Keep pretraining `bin/idx` data and `blend.json` from the same Nemotron release. 
-### Strategy `skill:` pointers may not resolve +## Examples -Many `[[strategies]]` blocks in `step.toml` carry a `skill:` pointer -(`Megatron-Bridge/skills/perf-techniques/...`, `Automodel/docs/guides/...`). -Those paths live in upstream repos, not here. If you can't read them, **don't -fail** — use the `then:` text as guidance and put a `⚠` in the plan: "Could -not read perf-tuning docs for `` — config may need manual review." - -### `${art:...}` belongs only to recipes/, not generated projects - -The reference recipes under [src/nemotron/recipes/](../../src/nemotron/recipes/) -use `${art:data,path}`, `${art:model,iteration}` for W&B-Artifacts lineage. -**Don't propagate `${art:...}` into generated stage configs** — they get -plain DATA_ROOT layout instead (see [act/PROJECT.md](act/PROJECT.md) R2). - -### `bin/idx + blend.json` is version-coupled - -Pretraining data prep produces `binidx` plus a `blend.json` manifest. The -`pretrain/megatron_bridge` step reads it via `dataset.data_paths`. **The two -must come from the same Nemotron release** — don't mix a freshly-prepped -blend with a six-month-old recipe. When the user can't reprep, surface a -`⚠`. +- Single step: read the manifest/config/env profile, then return a complete + `uv run nemotron steps run -c --dry-run` command. +- Pipeline: plan the step DAG first, validate artifact edges, then create only + the project YAML overlays needed for the approved stages. --- @@ -337,94 +348,53 @@ blend with a six-month-old recipe. When the user can't reprep, surface a Fast path. Levels 0 → 2 in Orient, then Plan → Act. -`STEPS.md → category/SKILL.md → step.toml → step.py → write code` +`STEPS.md → category/SKILL.md → step.toml → step.py → adapt YAML config` Use whenever the user's request maps to a step in the catalog. -### Explorer mode — no step, but a library supports it +### Explorer mode — no repo path supports it -1. Look at libraries cited in nearby `step.toml [reference]` URLs. -2. 
Read the relevant library docs / examples. -3. Use [types.toml](../../src/nemotron/steps/types.toml) to type the new +1. Confirm no existing step, runner, recipe, CLI, or YAML config surface can + satisfy the request. +2. Look at libraries cited in nearby `step.toml [reference]` URLs. +3. Read the relevant library docs / examples. +4. Use [types.toml](src/nemotron/steps/types.toml) to type the new stage's consumes/produces. -4. Write the stage from scratch, mirroring an existing `step.py` as a template. +5. Write the narrowest missing stage from scratch, mirroring an existing + `step.py` as a template. Tell the user: "This use case doesn't have a pre-built step. I'll build it from `` docs — the output will need more validation than a catalog-based stage." If the same Explorer build keeps appearing across projects, suggest the user -run `/nemotron-add-step` to land it in the catalog. +contribute it as a new catalog step under `src/nemotron/steps/`. ### Choosing a mode | User says | Mode | |---|---| | "SFT with Megatron-Bridge / AutoModel" | Catalog | -| "Distill / quantize / prune a model" | Catalog ([optimize/modelopt/*](../../src/nemotron/steps/optimize/modelopt/)) | -| "DPO / RLVR / GRPO / RLHF" | Catalog ([rl/nemo_rl/*](../../src/nemotron/steps/rl/nemo_rl/)) | -| "Synthesize preference / SFT data" | Catalog ([sdg/data_designer](../../src/nemotron/steps/sdg/data_designer/)) | -| "Translate EN → \" | Catalog ([translate/nemo_skills](../../src/nemotron/steps/translate/nemo_skills/)) | -| "Curate web text" | Catalog ([curate/nemo_curator](../../src/nemotron/steps/curate/nemo_curator/)) | -| "Deploy to TensorRT-LLM" | Explorer (no step yet — derive from upstream library docs and add a `convert/*` step if the path stabilizes) | +| "DPO / RLVR / GRPO / RLHF" | Catalog ([rl/nemo_rl/*](src/nemotron/steps/rl/nemo_rl/)) | +| "Synthesize preference / SFT data" | Catalog ([sdg/data_designer](src/nemotron/steps/sdg/data_designer/)) | +| "Translate EN → \ for training data" | 
Catalog ([translate/nemo_curator](src/nemotron/steps/translate/nemo_curator/)) | +| "Curate web text" | Catalog ([curate/nemo_curator](src/nemotron/steps/curate/nemo_curator/)) | | "Train with X exotic backend" | Explorer or **ask** | +| Post-training-only request | Out of scope for this skill; ask the user to use a more appropriate workflow. | | Ambiguous | **Ask** | --- -## Domain vocabulary - -### Step vs stage - -- **Step** = abstract building block in [src/nemotron/steps/](../../src/nemotron/steps/) (e.g. "SFT with Megatron-Bridge"). No position, no customer config. -- **Stage** = a step instantiated in a generated project (e.g. "stage 03: SFT for Thai Nano3"). Has a number, wired inputs, customer-specific YAML. - -Use "step" for the catalog, "stage" for the generated project. - -### Artifact graph - -``` -raw_jsonl ─is_a─> training_jsonl ─prep─> packed_parquet ─sft─> checkpoint_megatron - │ - convert_to - ▼ - checkpoint_hf ─eval─> eval_results -``` - -Definitions in [types.toml](../../src/nemotron/steps/types.toml). - -### Config hierarchy - -``` -config/default.yaml → recipe defaults → CLI overrides -``` - -Plain OmegaConf YAML + `parse_hydra_overrides`. **Never** generate Hydra -configs. - ---- - -## Tool preferences - -- **Catalog discovery**: `nemotron step list --json --consumes ` — don't grep `**/step.toml`. -- **Manifest read**: `nemotron step show ` — fastest single read. -- **Context packs**: load one large pack per stage via Act sub-agent — beats many small reads. -- **Step.py read**: full file — they're <100 lines. -- **Type validation**: read [types.toml](../../src/nemotron/steps/types.toml) once during Orient; keep in context through Verify. -- **Parallel reads**: batch step.toml + category SKILL.md reads. - ---- - ## Boundaries ### Do - Build pipelines from steps that exist; cite step.toml fields directly. +- Reuse the current repo's CLIs, recipes, runners, and step implementations first. 
- Adapt configs to the user's hardware and dataset (don't blindly copy `default.yaml`). - Fire strategies and follow `skill:` pointers when perf-tuning. -- Insert converter steps when artifact types don't chain. -- Ask about hardware, data, deploy target — never assume. -- Generate both `default.yaml` and `tiny.yaml` for every stage. +- Ask about hardware, data, backend, and output path — never assume. +- Generate only the YAML configs needed for the approved request. - Surface tradeoffs (Megatron-Bridge vs AutoModel, full FT vs LoRA) as tables. - Present the plan and wait for approval. @@ -432,12 +402,14 @@ configs. - Invent steps. Use Explorer mode or ask. - Skip Plan for any pipeline ≥2 stages. +- Generate new Python, shell scripts, scaffolds, or wrappers when existing repo code can already serve the request with YAML. - Import from modules not present in the step's reference code. - Add monitoring / logging / W&B unless the user asks. - Tune parallelism beyond what `hardware.md` and `[[strategies]]` advise. - Assume GPU count, type, or interconnect. -- Generate Slurm/Airflow/Kubeflow wrappers unless requested. -- Modify [src/nemotron/steps/](../../src/nemotron/steps/). To extend the catalog, route the user to `/nemotron-add-step`. +- Generate Slurm/Airflow/Kubeflow wrappers. +- Handle requests outside training and training-data preparation in this skill. +- Modify [src/nemotron/steps/](src/nemotron/steps/). To extend the catalog, point the user to the contribution workflow in `CONTRIBUTING.md`. - Restate per-step rules in this skill — link to the step's `SKILL.md` instead. --- @@ -446,19 +418,17 @@ configs. | Situation | Action | |---|---| -| No step matches the user's request | Check libraries cited in nearby `step.toml [reference]`. If supported, use Explorer mode. Otherwise ask. | -| Artifact types won't chain | Look up `convert_to` in [types.toml](../../src/nemotron/steps/types.toml). If a converter exists, add it. Otherwise: explain the gap and ask. 
| -| Strategy points to a missing skill file | Skip the load. Use the `then:` text as guidance. Note in plan: "⚠ Could not read perf-tuning docs for `` — config may need manual review." | +| No existing repo path matches the user's request | Check libraries cited in nearby `step.toml [reference]`. If supported, use Explorer mode. Otherwise ask. | +| Artifact types won't chain | Explain the gap and ask the user whether to change the training/data-prep plan. Do not add post-training work here. | +| Strategy points to a missing skill file | Skip the load. Use the `then:` text as guidance. Note in plan: "WARNING: Could not read perf-tuning docs for `` — config may need manual review." | | User's hardware is too small | Show the relevant `[[models]]` `min_gpus` table. Suggest in order: smaller model → AutoModel → LoRA. | | Two failed Act attempts | Stop. Explain what was tried, what failed, ask the user how to proceed. | -| User wants a feature that crosses 3+ projects | Build it Explorer-mode for them now. Then suggest `/nemotron-add-step` to land it in the catalog. | - ---- +| User wants a feature that crosses 3+ projects | Check whether YAML and existing repo code can serve it. If they cannot, build it Explorer-mode for them now, then suggest contributing it as a new step under `src/nemotron/steps/`. | -## Related skills +## Troubleshooting -- **[/nemotron-nano3](../nemotron-nano3/SKILL.md)** — facts about Nano3 (architecture, data, recipes, eval). Hands off here for "build me a pipeline." -- **[/nemotron-super3](../nemotron-super3/SKILL.md)** — facts about Super3. -- **[/nemotron-add-step](../nemotron-add-step/SKILL.md)** — extend the step catalog when Explorer mode keeps recurring. -- **[/nemotron-add-pattern](../nemotron-add-pattern/SKILL.md)** — encode a new cross-cutting decision rule. -- **[/nemotron-add-model](../nemotron-add-model/SKILL.md)** — onboard a new model family. 
+| Symptom | Action | +|---|---| +| Artifact types do not chain | Recheck `types.toml` and change the DAG before writing configs | +| Remote profile is unclear | Read the active env TOML; do not guess `--batch` | +| Config key is unclear | Read the step config, `step.py`, and shared runner before editing | diff --git a/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt b/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt deleted file mode 100644 index 85bc99e46..000000000 --- a/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt +++ /dev/null @@ -1,2163 +0,0 @@ - -/Users/mromeijn/src/Automodel -├── docs -│ ├── launcher -│ │ ├── local-workstation.md * -│ │ ├── nemo-run.md * -│ │ ├── overview.md * -│ │ ├── skypilot.md * -│ │ └── slurm.md * -│ ├── about -│ ├── guides -│ │ ├── diffusion -│ │ ├── llm -│ │ ├── omni -│ │ └── vlm -│ └── model-coverage -│ ├── diffusion -│ │ ├── black-forest-labs -│ │ ├── hunyuanvideo-community -│ │ └── wan-ai -│ ├── llm -│ │ ├── allenai -│ │ ├── baai -│ │ ├── baichuan-inc -│ │ ├── bigcode -│ │ ├── bytedance-seed -│ │ ├── cohere -│ │ ├── deepseek-ai -│ │ ├── eleutherai -│ │ ├── google -│ │ ├── ibm -│ │ ├── inceptionai -│ │ ├── internlm -│ │ ├── lgai-exaone -│ │ ├── meta -│ │ ├── microsoft -│ │ ├── minimax -│ │ ├── mistralai -│ │ ├── moonshotai -│ │ ├── nvidia -│ │ ├── openai -│ │ ├── openbmb -│ │ ├── orionstar -│ │ ├── parasail-ai -│ │ ├── qwen -│ │ ├── stabilityai -│ │ ├── stepfun-ai -│ │ ├── thudm -│ │ ├── tiiuae -│ │ └── upstage -│ ├── omni -│ │ ├── microsoft -│ │ └── qwen -│ └── vlm -│ ├── google -│ ├── huggingface -│ ├── internlm -│ ├── llava-hf -│ ├── meta -│ ├── mistralai -│ ├── moonshotai -│ ├── nvidia -│ └── qwen -├── examples -│ ├── llm_finetune -│ │ ├── llama3_2 -│ │ │ ├── llama3_2_1b_squad.yaml * -│ │ │ └── llama3_2_1b_squad_skypilot.yaml * -│ │ ├── baichuan -│ │ ├── cohere -│ │ ├── deepseek_v32 -│ │ ├── devstral -│ │ ├── falcon -│ │ ├── gemma -│ │ ├── glm -│ │ ├── 
gpt_oss -│ │ ├── granite -│ │ ├── llama3_1 -│ │ ├── llama3_3 -│ │ ├── minimax_m2 -│ │ ├── mistral -│ │ ├── moonlight -│ │ ├── nemotron -│ │ ├── nemotron_flash -│ │ ├── olmo -│ │ ├── phi -│ │ ├── qwen -│ │ ├── seed -│ │ ├── starcoder -│ │ └── stepfun -│ ├── convergence -│ │ └── tulu3 -│ │ ├── data -│ │ ├── eval -│ │ ├── inference -│ │ ├── model-verification -│ │ ├── models -│ │ │ ├── gpt-oss-20b -│ │ │ │ └── assets -│ │ │ ├── moonlight-16b -│ │ │ │ └── assets -│ │ │ ├── qwen3-4b -│ │ │ │ └── assets -│ │ │ └── qwen3-moe-30b -│ │ │ ├── assets -│ │ │ └── experiments -│ │ └── training -│ ├── diffusion -│ │ ├── finetune -│ │ ├── generate -│ │ │ └── configs -│ │ └── pretrain -│ ├── dllm_generate -│ ├── dllm_sft -│ ├── llm_benchmark -│ │ ├── deepseek -│ │ ├── glm -│ │ ├── gpt_oss -│ │ ├── kimi -│ │ ├── llama3_3 -│ │ ├── minimax -│ │ ├── mistral -│ │ ├── moonlight -│ │ ├── nemotron -│ │ ├── qwen -│ │ └── step -│ ├── llm_kd -│ │ └── llama3_2 -│ ├── llm_pretrain -│ ├── llm_seq_cls -│ │ └── glue -│ ├── retrieval -│ │ ├── bi_encoder -│ │ │ └── llama_embed_nemotron_8b -│ │ ├── cross_encoder -│ │ └── data_utils -│ ├── vlm_benchmark -│ │ ├── kimi -│ │ ├── mistral -│ │ └── qwen -│ ├── vlm_finetune -│ │ ├── gemma3 -│ │ ├── gemma3n -│ │ ├── gemma4 -│ │ ├── internvl -│ │ ├── kimi -│ │ ├── mistral -│ │ ├── mistral4 -│ │ ├── nemotron -│ │ ├── phi4 -│ │ ├── qwen2_5 -│ │ ├── qwen3 -│ │ ├── qwen3_5 -│ │ └── qwen3_5_moe -│ └── vlm_generate -├── nemo_automodel -│ ├── components -│ │ ├── launcher -│ │ │ ├── nemo_run -│ │ │ │ ├── config.py * + -│ │ │ │ └── launcher.py * + -│ │ │ ├── skypilot -│ │ │ │ ├── config.py * + -│ │ │ │ └── launcher.py * + -│ │ │ ├── base.py * + -│ │ │ └── interactive.py * + -│ │ ├── _peft -│ │ ├── attention -│ │ ├── checkpoint -│ │ │ └── _backports -│ │ ├── config -│ │ ├── datasets -│ │ │ ├── diffusion -│ │ │ ├── dllm -│ │ │ ├── llm -│ │ │ │ └── megatron -│ │ │ └── vlm -│ │ ├── distributed -│ │ │ └── pipelining -│ │ ├── flow_matching -│ │ │ └── adapters -│ │ ├── 
loggers -│ │ ├── loss -│ │ │ └── triton -│ │ ├── models -│ │ │ ├── baichuan -│ │ │ ├── common -│ │ │ ├── deepseek_v3 -│ │ │ ├── deepseek_v32 -│ │ │ ├── gemma4_moe -│ │ │ ├── glm4_moe -│ │ │ ├── glm4_moe_lite -│ │ │ ├── glm_moe_dsa -│ │ │ ├── gpt_oss -│ │ │ ├── kimi_k25_vl -│ │ │ ├── kimivl -│ │ │ ├── llama -│ │ │ ├── llama_bidirectional -│ │ │ ├── minimax_m2 -│ │ │ ├── mistral3 -│ │ │ ├── mistral4 -│ │ │ ├── nemotron_parse -│ │ │ ├── nemotron_v3 -│ │ │ ├── qwen2 -│ │ │ ├── qwen3_5_moe -│ │ │ ├── qwen3_moe -│ │ │ ├── qwen3_next -│ │ │ ├── qwen3_omni_moe -│ │ │ ├── qwen3_vl_moe -│ │ │ └── step3p5 -│ │ ├── moe -│ │ │ ├── megatron -│ │ │ └── uccl_ep -│ │ ├── optim -│ │ ├── quantization -│ │ ├── training -│ │ └── utils -│ ├── _diffusers -│ ├── _transformers -│ │ └── tokenization -│ ├── autonvtx -│ ├── cli -│ ├── recipes -│ │ ├── diffusion -│ │ ├── dllm -│ │ ├── llm -│ │ ├── retrieval -│ │ └── vlm -│ └── shared -├── .github -│ ├── actions -│ │ ├── build-container -│ │ └── test-template -│ └── workflows -│ └── config -├── docker -│ └── common -├── scripts -├── skills -│ ├── .claude -│ │ └── skills -│ │ ├── developer-guide -│ │ ├── distributed-training -│ │ ├── launcher-config -│ │ ├── model-onboarding -│ │ ├── parity-testing -│ │ └── recipe-development -│ ├── developer-guide -│ ├── distributed-training -│ ├── launcher-config -│ ├── model-onboarding -│ ├── parity-testing -│ └── recipe-development -├── tests -│ ├── ci_tests -│ │ ├── configs -│ │ │ ├── llm_benchmark -│ │ │ ├── llm_finetune -│ │ │ ├── vlm_benchmark -│ │ │ └── vlm_finetune -│ │ ├── golden_values -│ │ │ ├── llm_finetune -│ │ │ │ ├── baichuan -│ │ │ │ ├── falcon -│ │ │ │ ├── gemma -│ │ │ │ ├── glm -│ │ │ │ ├── gpt_oss -│ │ │ │ ├── granite -│ │ │ │ ├── llama3_1 -│ │ │ │ ├── llama3_2 -│ │ │ │ ├── mistral -│ │ │ │ ├── moonlight -│ │ │ │ ├── nemotron -│ │ │ │ ├── nemotron_flash -│ │ │ │ ├── olmo -│ │ │ │ ├── phi -│ │ │ │ ├── qwen -│ │ │ │ ├── seed -│ │ │ │ └── starcoder -│ │ │ └── vlm_finetune -│ │ │ ├── gemma3 -│ │ 
│ ├── gemma3n -│ │ │ ├── internvl -│ │ │ ├── mistral -│ │ │ ├── nemotron -│ │ │ ├── qwen2_5 -│ │ │ ├── qwen3 -│ │ │ └── qwen3_5_moe -│ │ ├── scripts -│ │ └── utils -│ ├── functional_tests -│ │ ├── checkpoint -│ │ ├── checkpoint_robustness -│ │ ├── context_parallel -│ │ ├── data -│ │ │ └── llm -│ │ ├── datasets -│ │ │ └── llm -│ │ ├── hf_dcp -│ │ ├── hf_peft -│ │ ├── hf_transformer -│ │ ├── hf_transformer_finetune -│ │ ├── hf_transformer_llm -│ │ ├── hf_transformer_vlm -│ │ ├── llm_pretrain_and_kd -│ │ │ ├── customizer_retrieval -│ │ │ ├── llm_seq_cls -│ │ │ └── loss -│ │ ├── retrieval -│ │ └── training -│ ├── unit_tests -│ │ ├── _cli -│ │ ├── _diffusers -│ │ ├── _peft -│ │ ├── _transformers -│ │ ├── attention -│ │ ├── checkpoint -│ │ ├── components -│ │ │ └── training -│ │ ├── config -│ │ ├── datasets -│ │ │ ├── diffusion -│ │ │ ├── dllm -│ │ │ ├── llm -│ │ │ └── vlm -│ │ ├── diffusion_processors -│ │ ├── distributed -│ │ │ └── pipelining -│ │ ├── flow_matching -│ │ │ └── adapters -│ │ ├── launcher -│ │ ├── loggers -│ │ ├── loss -│ │ ├── models -│ │ │ ├── baichuan -│ │ │ ├── bi_encoder -│ │ │ ├── common -│ │ │ ├── deepseek_v3 -│ │ │ ├── deepseek_v32 -│ │ │ ├── gemma4 -│ │ │ ├── glm4_moe -│ │ │ ├── glm4_moe_lite -│ │ │ ├── glm_moe_dsa -│ │ │ ├── gpt_oss -│ │ │ ├── kimi_k25_vl -│ │ │ ├── kimivl -│ │ │ ├── llama -│ │ │ ├── minimax_m2 -│ │ │ ├── mistral3 -│ │ │ ├── mistral4 -│ │ │ ├── nemotron_parse -│ │ │ ├── nemotron_v3 -│ │ │ ├── qwen2 -│ │ │ ├── qwen3_5 -│ │ │ ├── qwen3_5_moe -│ │ │ ├── qwen3_moe -│ │ │ ├── qwen3_next -│ │ │ ├── qwen3_omni_moe -│ │ │ ├── qwen3_vl_moe -│ │ │ └── step3p5 -│ │ ├── moe -│ │ ├── optim -│ │ ├── quantization -│ │ ├── recipes -│ │ │ ├── dllm -│ │ │ └── llm -│ │ ├── shared -│ │ ├── tools -│ │ ├── training -│ │ └── utils -│ └── utils -├── tools -│ └── diffusion -│ ├── data -│ └── processors -└── tutorials - └── nemotron-parse - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; selected files shown. 
- - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/base.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod -from pathlib import Path -from typing import Any, Dict, List, Optional - - -class Launcher(ABC): - """Base class for all job launchers (interactive, SLURM, SkyPilot, nemo-run).""" - - @abstractmethod - def launch( - self, - config: Dict[str, Any], - config_path: Path, - recipe_target: str, - launcher_config: Any, - extra_args: Optional[List[str]] = None, - ) -> int: - """Launch a recipe job. - - Args: - config: Parsed YAML config dict (without the launcher section). - config_path: Resolved path to the original YAML file. - recipe_target: Dotted import path of the recipe class. - launcher_config: Launcher-specific configuration (dict, int, or None). - extra_args: Additional CLI overrides forwarded to the recipe. - - Returns: - Process exit code (0 = success). - """ - ... - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/interactive.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import logging -import os -from pathlib import Path -from typing import Any, Dict, List, Optional - -from nemo_automodel.components.launcher.base import Launcher - -logger = logging.getLogger(__name__) - - -def _get_repo_root() -> Path: - """Return the repository root. If CWD looks like an editable checkout, - prepend it to ``PYTHONPATH`` so the local source takes precedence.""" - cwd = Path.cwd() - if (cwd / "nemo_automodel/components").exists() and (cwd / "examples/").exists(): - new_pp = str(cwd) - if "PYTHONPATH" in os.environ: - new_pp += ":" + os.environ["PYTHONPATH"] - os.environ["PYTHONPATH"] = new_pp - logger.info("Running job using source from: %s", cwd) - return cwd - return Path(__file__).parents[3] - - -def resolve_recipe_cls(target_str: str): - """Import and return the recipe class from a dotted path. 
def _recipe_module_path(recipe_target: str, repo_root: Path) -> Path:
    """Map a dotted recipe target onto the ``.py`` file of its module.

    The final dotted component (the class name) is dropped, the remaining
    dots become ``/`` separators, ``.py`` is appended, and the result is
    joined onto *repo_root*.
    """
    dotted_module = recipe_target.rsplit(".", 1)[0]
    segments = dotted_module.split(".")
    return repo_root / ("/".join(segments) + ".py")
def _run_recipe_in_process(self, recipe_target: str, config: Dict[str, Any]) -> int:
    """Build the recipe named by *recipe_target* and run it in this process.

    The class is looked up with ``resolve_recipe_cls``, constructed with
    *config*, set up, and then its train/validation loop is run; whatever
    that loop returns is returned to the caller.
    """
    recipe_instance = resolve_recipe_cls(recipe_target)(config)
    recipe_instance.setup()
    return recipe_instance.run_train_validation_loop()
# Default path to user-defined executor definitions.
# Respects the NEMORUN_HOME env var used by nemo-run itself (defaults to ~/.nemo_run).
_NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.join(os.path.expanduser("~"), ".nemo_run"))
DEFAULT_EXECUTORS_FILE = os.path.join(_NEMORUN_HOME, "executors.py")

# Keys that belong to NemoRunConfig itself (not executor overrides).
_LAUNCHER_KEYS = frozenset(
    {
        "executor",
        "job_name",
        "detach",
        "tail_logs",
        "executors_file",
        "job_dir",
        "overrides",
    }
)


@dataclass
class NemoRunConfig:
    """Configuration for the NeMo-Run launcher backend.

    The ``executor`` field selects a named executor from
    ``$NEMORUN_HOME/executors.py``, or ``"local"`` for local execution.

    Any key not recognised as a launcher setting is collected into
    ``overrides`` and applied directly to the executor via ``setattr``.
    This means any executor attribute (``nodes``, ``partition``,
    ``container_image``, ``time``, ``env_vars``, etc.) can be overridden
    from YAML without changes to this config class.
    """

    # Executor selection: name from EXECUTOR_MAP or "local"
    executor: str = "local"

    # Job metadata
    job_name: str = ""

    # Experiment behaviour
    detach: bool = True
    tail_logs: bool = False

    # Path to executor definitions file
    executors_file: str = field(default_factory=lambda: DEFAULT_EXECUTORS_FILE)

    # Local directory for job artifacts (config snapshot, logs)
    job_dir: str = ""

    # Arbitrary executor attribute overrides (e.g. nodes, partition,
    # container_image, time, env_vars). Populated automatically from
    # unrecognised YAML keys by ``from_dict``.
    overrides: dict = field(default_factory=dict)

    @classmethod
    def from_dict(cls, d: dict) -> "NemoRunConfig":
        """Build from a raw YAML dict, splitting launcher keys from executor overrides.

        Keys listed in ``_LAUNCHER_KEYS`` become constructor arguments. Every
        other key is merged into ``overrides``; when the same name appears
        both inside an explicit ``overrides`` mapping and as a top-level key,
        the top-level value wins.

        Args:
            d: The launcher section of the YAML config. It is only read:
                neither ``d`` nor a nested ``overrides`` mapping is modified.

        Returns:
            A new ``NemoRunConfig``.
        """
        launcher_kwargs = {k: v for k, v in d.items() if k in _LAUNCHER_KEYS}
        extra = {k: v for k, v in d.items() if k not in _LAUNCHER_KEYS}

        # Copy the explicit overrides before merging so the caller's mapping
        # is never aliased or mutated; ``or {}`` tolerates ``overrides: null``.
        merged = dict(launcher_kwargs.get("overrides") or {})
        merged.update(extra)
        launcher_kwargs["overrides"] = merged
        return cls(**launcher_kwargs)
logger = logging.getLogger(__name__)

# Config filename and its path inside the container (/nemo_run/code/).
_CONFIG_FILENAME = "automodel_config.yaml"
_REMOTE_CONFIG_PATH = f"/nemo_run/code/{_CONFIG_FILENAME}"


class NemoRunLauncher(Launcher):
    """Submit a recipe through NeMo-Run's executor API.

    Executors are either the built-in local one or a named entry loaded from
    an executors file (``$NEMORUN_HOME/executors.py`` by default). The job is
    submitted as a ``nemo_run.Script`` with the executor's launcher set to
    ``"torchrun"``, and the training config YAML is shipped with a
    ``PatternPackager`` so that it is available at
    ``/nemo_run/code/automodel_config.yaml`` inside the container.
    """

    def _resolve_executor(self, nr_config: NemoRunConfig) -> Any:
        """Return the executor selected by *nr_config* with overrides applied.

        ``"local"`` yields a fresh ``run.LocalExecutor()``; any other name is
        loaded from ``nr_config.executors_file``. Exits the process with
        status 1 when ``nemo_run`` cannot be imported.
        """
        try:
            import nemo_run as run
        except ImportError:
            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
            sys.exit(1)

        wants_local = nr_config.executor == "local"
        executor = (
            run.LocalExecutor()
            if wants_local
            else load_executor_from_file(nr_config.executor, nr_config.executors_file)
        )
        apply_overrides(executor, nr_config.overrides)
        return executor

    @staticmethod
    def _configure_torchrun(executor: Any, devices: int) -> None:
        """Switch *executor* to the ``"torchrun"`` launcher.

        ``torchrun_nproc_per_node`` is set to *devices* only on executors
        that already expose that attribute.
        """
        executor.launcher = "torchrun"
        supports_nproc = hasattr(executor, "torchrun_nproc_per_node")
        if supports_nproc:
            executor.torchrun_nproc_per_node = devices

    @staticmethod
    def _setup_packager(executor: Any, config_path: str) -> None:
        """Attach a ``PatternPackager`` that ships the config YAML.

        The packager includes *config_path* relative to its parent
        directory. Does nothing when ``nemo_run`` cannot be imported.
        """
        try:
            import nemo_run as run
        except ImportError:
            return

        executor.packager = run.PatternPackager(
            include_pattern=config_path,
            relative_path=os.path.dirname(config_path),
        )

    def launch(
        self,
        config: Dict[str, Any],
        config_path: Path,
        recipe_target: str,
        launcher_config: Dict[str, Any],
        extra_args: Optional[List[str]] = None,
    ) -> int:
        """Write the job config locally and submit the recipe via NeMo-Run.

        Args:
            config: Training config dict; dumped to YAML in the job directory.
            config_path: Path of the original YAML file (not read here).
            recipe_target: Dotted path of the recipe class; its module part is
                run with ``python -m``.
            launcher_config: Raw ``nemo_run`` section, parsed with
                ``NemoRunConfig.from_dict``.
            extra_args: Extra arguments appended after ``-c <remote config>``.

        Returns:
            Whatever ``submit_nemo_run_job`` returns. Exits the process with
            status 1 when ``nemo_run`` cannot be imported.
        """
        try:
            import nemo_run as run
        except ImportError:
            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
            sys.exit(1)

        nr_config = NemoRunConfig.from_dict(launcher_config)
        executor = self._resolve_executor(nr_config)

        # GPUs per node come from the executor's nproc_per_node(); executors
        # that lack it or do not implement it are treated as one device.
        try:
            devices = executor.nproc_per_node()
        except (NotImplementedError, AttributeError):
            devices = 1

        # Must happen before the experiment runs: per the original note,
        # NeMo-Run reads the launcher during the packaging phase.
        self._configure_torchrun(executor, devices)

        # One timestamped directory per submission, under job_dir when given
        # and under ./nemo_run_jobs otherwise.
        jobs_root = nr_config.job_dir or os.path.join(os.getcwd(), "nemo_run_jobs")
        job_dir = os.path.join(jobs_root, str(int(_time.time())))
        os.makedirs(job_dir, exist_ok=True)

        # The same file is both the local record and the packaged payload.
        config_yaml = yaml.dump(config, default_flow_style=False, sort_keys=False)
        local_config_path = os.path.join(job_dir, _CONFIG_FILENAME)
        with open(local_config_path, "w") as fp:
            fp.write(config_yaml)
        logger.info("NeMo-Run job artifacts in: %s", job_dir)

        self._setup_packager(executor, local_config_path)

        # Run the recipe's module with ``python -m <module>`` so it is
        # resolved from the installed package, not from a relative file path.
        module_path, _, class_name = recipe_target.rpartition(".")
        if not module_path:
            # No dot in the target: rsplit-style behaviour keeps the whole string.
            module_path = recipe_target
        script_args = ["-c", _REMOTE_CONFIG_PATH]
        if extra_args:
            script_args.extend(extra_args)

        script = run.Script(
            path=module_path,
            m=True,
            entrypoint="python",
            args=script_args,
        )
        job_name = nr_config.job_name or f"{class_name}"

        return submit_nemo_run_job(
            script=script,
            executor=executor,
            job_name=job_name,
            detach=nr_config.detach,
            tail_logs=nr_config.tail_logs,
        )
SUPPORTED_CLOUDS = ("aws", "gcp", "azure", "lambda", "kubernetes")


@dataclass
class SkyPilotConfig:
    """Settings for one SkyPilot job.

    ``cloud`` is the only required field. ``hf_token`` and ``wandb_key``
    default to the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment variables
    (empty string when unset). Values are validated in ``__post_init__``.
    """

    # Required: cloud provider
    cloud: str = field(metadata={"help": f"Cloud provider. One of: {SUPPORTED_CLOUDS}"})

    # Compute resources
    accelerators: str = field(
        default="T4:1",
        metadata={"help": "GPU type and count per node, e.g. 'T4:1', 'A100:8'"},
    )
    num_nodes: int = field(default=1, metadata={"help": "Number of nodes for distributed training"})
    use_spot: bool = field(default=True, metadata={"help": "Use spot/preemptible instances for cost savings"})
    disk_size: int = field(default=100, metadata={"help": "Disk size in GB"})
    instance_type: str | None = field(
        default=None,
        metadata={"help": "Specific cloud instance type; auto-selected if None"},
    )

    # Cloud location
    region: str | None = field(default=None, metadata={"help": "Cloud region"})
    zone: str | None = field(default=None, metadata={"help": "Availability zone within the region"})

    # Job identity
    job_name: str = field(default="", metadata={"help": "Job and SkyPilot cluster name"})

    # Remote environment
    setup: str = field(
        default="",
        metadata={"help": "Shell commands run on the remote VM before training starts"},
    )
    hf_home: str = field(
        default="~/.cache/huggingface",
        metadata={"help": "HuggingFace cache directory on the remote VM"},
    )

    # Credentials (sourced from env by default, never hard-coded)
    hf_token: str = field(
        default_factory=lambda: os.environ.get("HF_TOKEN", ""),
        metadata={"help": "HuggingFace token for gated model access"},
    )
    wandb_key: str = field(
        default_factory=lambda: os.environ.get("WANDB_API_KEY", ""),
        metadata={"help": "Weights & Biases API key"},
    )
    env_vars: dict[str, str] = field(
        default_factory=dict,
        metadata={"help": "Additional environment variables to set on the remote VM"},
    )

    # Training command (set programmatically by the launcher, not exposed in YAML)
    command: str = field(default="", metadata={"help": "Training command executed on the remote VM"})

    def __post_init__(self) -> None:
        """Validate the cloud name and the two minimum-of-one numeric fields.

        Raises:
            ValueError: If ``cloud`` (compared case-insensitively) is not in
                ``SUPPORTED_CLOUDS``, or ``num_nodes`` / ``disk_size`` is
                below 1. Checks run in that order.
        """
        if self.cloud.lower() not in SUPPORTED_CLOUDS:
            raise ValueError(f"'cloud' must be one of {SUPPORTED_CLOUDS}, got: {self.cloud!r}")

        minimum_one = (
            ("num_nodes", "'num_nodes' must be >= 1, got: {}"),
            ("disk_size", "'disk_size' must be >= 1 GB, got: {}"),
        )
        for attr, template in minimum_one:
            value = getattr(self, attr)
            if value < 1:
                raise ValueError(template.format(value))
logger = logging.getLogger(__name__)


def _parse_gpus_per_node(accelerators: str) -> int:
    """Extract the GPU count from an accelerator string like ``'A100:8'``.

    Returns 1 when the string is not of the form ``<name>:<int>``.
    """
    parts = accelerators.split(":")
    if len(parts) == 2:
        try:
            return int(parts[1])
        except ValueError:
            pass
    return 1


def _recipe_module_path(recipe_target: str, repo_root: str) -> str:
    """Return the ``.py`` path of the recipe's module under *repo_root*.

    The last dotted component of *recipe_target* (the class name) is
    dropped and the remaining dots are turned into ``/``.
    """
    module_path = recipe_target.rsplit(".", 1)[0]
    return os.path.join(repo_root, module_path.replace(".", "/") + ".py")


class SkyPilotLauncher(Launcher):
    """Launch a recipe job on a cloud VM via SkyPilot."""

    def _build_command(
        self,
        recipe_target: str,
        job_conf_path: str,
        gpus_per_node: int,
        num_nodes: int,
        extra_args: Optional[List[str]] = None,
    ) -> str:
        """Build the shell command line that runs the recipe under torchrun.

        Args:
            recipe_target: Dotted path of the recipe class.
            job_conf_path: Config path passed to the recipe with ``-c``.
            gpus_per_node: Value for ``--nproc_per_node``.
            num_nodes: When greater than 1, multi-node rendezvous flags
                driven by the ``SKYPILOT_*`` variables are added.
            extra_args: Extra arguments appended verbatim at the end.

        Returns:
            The space-joined command string.
        """
        repo_root = "~/sky_workdir"
        script_path = _recipe_module_path(recipe_target, repo_root)

        parts = [
            f"PYTHONPATH={repo_root}:$PYTHONPATH",
            "torchrun",
            f"--nproc_per_node={gpus_per_node}",
        ]

        if num_nodes > 1:
            # $SKYPILOT_NODE_IPS must be double-quoted: unquoted, the shell
            # word-splits the newline-separated list and ``echo`` prints all
            # IPs on one line, so ``head -n1`` no longer yields only the
            # first address.
            # NOTE(review): with --rdzv_backend=c10d torchrun usually takes
            # --rdzv_endpoint; confirm --master_addr/--master_port are
            # honoured for this backend.
            parts += [
                "--nnodes=$SKYPILOT_NUM_NODES",
                "--node_rank=$SKYPILOT_NODE_RANK",
                "--rdzv_backend=c10d",
                '--master_addr=$(echo "$SKYPILOT_NODE_IPS" | head -n1)',
                "--master_port=12375",
            ]

        parts += [script_path, "-c", job_conf_path]

        if extra_args:
            parts.extend(extra_args)

        return " ".join(parts)

    def launch(
        self,
        config: Dict[str, Any],
        config_path: Path,
        recipe_target: str,
        launcher_config: Dict[str, Any],
        extra_args: Optional[List[str]] = None,
    ) -> int:
        """Write the job config locally and submit the recipe through SkyPilot.

        Args:
            config: Training config dict (without the skypilot section);
                dumped to ``job_config.yaml`` in the job directory.
            config_path: Path of the original YAML file (not read here).
            recipe_target: Dotted path of the recipe class.
            launcher_config: Raw ``skypilot`` section. ``job_dir``,
                ``gpus_per_node`` and ``job_name`` are consumed here; the
                remaining keys that are ``SkyPilotConfig`` fields are passed
                through to it.
            extra_args: Extra arguments forwarded to the recipe command.

        Returns:
            Whatever ``submit_skypilot_job`` returns.
        """
        from nemo_automodel.components.launcher.skypilot.utils import submit_skypilot_job

        # Work on a copy so the caller's launcher section is not modified.
        skypilot_cfg = dict(launcher_config)

        # An empty or null job_dir falls back to ./skypilot_jobs instead of
        # failing inside os.path.join.
        jobs_root = skypilot_cfg.pop("job_dir", None) or os.path.join(os.getcwd(), "skypilot_jobs")
        job_dir = os.path.join(jobs_root, str(int(time.time())))
        os.makedirs(job_dir, exist_ok=True)

        # Write the training config (without skypilot section) for upload.
        job_conf_path = os.path.join(job_dir, "job_config.yaml")
        with open(job_conf_path, "w") as fp:
            yaml.dump(config, fp, default_flow_style=False, sort_keys=False)
        logger.info("SkyPilot job artifacts in: %s", job_dir)

        accelerators = skypilot_cfg.get("accelerators", "T4:1")
        gpus_per_node = skypilot_cfg.pop("gpus_per_node", None) or _parse_gpus_per_node(accelerators)
        num_nodes = skypilot_cfg.get("num_nodes", 1)

        command = self._build_command(
            recipe_target,
            REMOTE_CONFIG_PATH,
            gpus_per_node,
            num_nodes,
            extra_args=extra_args,
        )

        job_name = skypilot_cfg.pop("job_name", "") or f"{recipe_target.rsplit('.', 1)[-1]}"

        # ``command`` is always set by this launcher; drop a user-supplied
        # one so it cannot collide with the explicit keyword below.
        passthrough = {
            k: v for k, v in skypilot_cfg.items() if k in SkyPilotConfig.__dataclass_fields__ and k != "command"
        }
        sky_config = SkyPilotConfig(
            command=command,
            job_name=job_name,
            **passthrough,
        )

        return submit_skypilot_job(sky_config, job_dir)
No scheduler or cluster software needed: - -```bash -automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -See the [Local Workstation](./local-workstation.md) guide. - -### I have access to a Slurm cluster - -Add a `slurm:` section to your YAML config and submit with the same `automodel` command. The CLI generates the `torchrun` invocation and calls `sbatch` for you: - -```bash -automodel config_with_slurm.yaml -``` - -See the [Slurm](./slurm.md) guide. - -### I want managed job submission (Slurm, Kubernetes, Docker) - -Add a `nemo_run:` section to your YAML config. NeMo-Run loads a pre-configured executor for your compute target and submits the job: - -```bash -automodel config_with_nemo_run.yaml -``` - -See the [NeMo-Run](./nemo-run.md) guide. - -### I want to train on the cloud - -Add a `skypilot:` section to your YAML config. SkyPilot provisions VMs on any major cloud and handles spot-instance preemption automatically: - -```bash -automodel config_with_skypilot.yaml -``` - -See the [SkyPilot](./skypilot.md) guide. - -## All Launchers Use the Same Config - -Every launcher shares the same YAML recipe format. The only difference is an optional launcher section (`slurm:`, `nemo_run:`, or `skypilot:`) that tells the CLI where to run. Without a launcher section, training runs interactively on the current machine. - -``` - -File: /Users/mromeijn/src/Automodel/docs/launcher/local-workstation.md -```md -# Run on Your Local Workstation - -Use this guide for local, single-node workflows on a workstation or an interactive Slurm allocation. For setup details, refer to our [Installation Guide](../guides/installation.md). -For batch multi-node jobs, see the [Slurm](./slurm.md) or [SkyPilot](./skypilot.md) guides. - -NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide. 
- - -## Quick Start: Choose Your Job Launch Option - -- **CLI (recommended)** - ```bash - automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - ``` - -- **Direct recipe script** - - Single GPU - ```bash - python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - ``` - - Multi-GPU (single node) - ```bash - torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - ``` - -## Run with AutoModel CLI (Single Node) - -The AutoModel CLI is the preferred method for most users. It offers a unified interface to launch training scaling from a local workstation (this guide) to large clusters (see our [cluster guide](./slurm.md)). - -### Basic Usage - -The CLI follows this format: -```bash -automodel <config.yaml> [--nproc-per-node N] [--key.subkey=override ...] -``` - -A short alias `am` is also available. Both commands also work with `uv run` (e.g., `uv run automodel <config.yaml>`). - -Where: -- `<config.yaml>`: Path to your YAML configuration file (must contain a `recipe._target_` key) -- `--nproc-per-node`: Optional override for the number of GPUs to use - -The recipe class is specified inside the YAML via the `recipe._target_` key: -```yaml -recipe: - _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction -``` - -### Train on a Single GPU - -For simple fine-tuning on a single GPU: - -```bash -automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -### Train on Multiple GPUs (Single Node) - -For interactive single-node jobs, the CLI automatically detects the number of available GPUs and -uses `torchrun` for multi-GPU training. You can manually specify the number of GPUs using the `--nproc-per-node` option: - -```bash -automodel --nproc-per-node 2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -If you don't specify `--nproc-per-node`, it will use all available GPUs on your system. - -Looking for Slurm or cloud training? 
See [Slurm](./slurm.md) or [SkyPilot](./skypilot.md). - -## Run with uv (Development Mode) - -When you need more control over the environment or are actively developing with the codebase, you can use `uv` to run training scripts directly. This approach gives you direct access to the underlying Python scripts and is ideal for debugging or customization. - -### Train on a Single GPU - -```bash -uv run nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -### Train on Multiple GPUs with Torchrun (Single Node) - -For multi-GPU single-node training, use `torchrun` directly: - -```bash -uv run torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -### Why Use uv? - -uv provides several advantages for development and experimentation: - -- **Automatic environment management**: uv automatically creates and manages virtual environments, ensuring consistent dependencies without manual setup. -- **Lock file synchronization**: Keeps your local environment perfectly synchronized with the project's `uv.lock` file. -- **No installation required**: Run scripts directly from the repository without installing packages system-wide. -- **Development flexibility**: Direct access to Python scripts for debugging, profiling, and customization. -- **Dependency isolation**: Each project gets its own isolated environment, preventing conflicts. 
- -## Run with Torchrun - -If you have NeMo AutoModel installed in your environment and prefer to run recipes directly without uv, you can use `torchrun` directly: - -### Train on a Single GPU - -```bash -python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -### Train on Multiple GPUs (Single Node) - -```bash -torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -``` - -This approach requires that you have already installed NeMo AutoModel and its dependencies in your Python environment (see the [installation guide](../guides/installation.md) for details). - -## Customize Configuration Settings - -All approaches use the same YAML configuration files. You can easily customize training by following the steps in this section. - -1. **Override config values**: Use command-line arguments to directly replace default settings. -For example, if you want to fine-tune `Qwen/Qwen3-0.6B` instead of `meta-llama/Llama-3.2-1B`, you can use: - ```bash - automodel config.yaml --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B - ``` - -2. **Edit the config file**: Modify the YAML directly for persistent changes. - -3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory. 
- -## When to Use Which Approach - -**Use the AutoModel CLI when:** -- You want a simple, unified interface -- You are running locally on a single machine -- You don't need to modify the underlying code -- You prefer a higher-level abstraction - -**Use uv when:** -- You're developing or debugging the codebase -- You want automatic dependency management -- You need maximum control over the execution -- You want to avoid manual environment setup -- You're experimenting with custom modifications - -**Use Torchrun when:** -- You have a stable, pre-configured environment -- You prefer explicit control over Python execution -- You're working in environments where uv is not available -- You're integrating with existing PyTorch workflows - -All approaches use the same configuration files and provide the same training capabilities on a single node. For multi-node training, see [Run on a Cluster](./slurm.md). - -``` - -File: /Users/mromeijn/src/Automodel/docs/launcher/slurm.md -```md -# Run on a Cluster - -In this guide, you will learn how to submit distributed training jobs on Slurm clusters (single- or multi-node). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md). For setup details, refer to our [Installation Guide](../guides/installation.md). - -NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide. 
- - -## Quickstart - -```bash -# Edit the reference script for your cluster, then submit: -cp slurm.sub my_cluster.sub -vim my_cluster.sub -sbatch my_cluster.sub -``` - -For interactive testing on a Slurm node: - - Single node, single GPU - ```bash - automodel your_config.yaml - ``` - - Single node, multiple GPUs - ```bash - automodel --nproc-per-node 8 your_config.yaml - ``` - -## Submit a Batch Job with Slurm - -SLURM clusters vary widely: some use Pyxis containers, others use -Singularity/Apptainer, and many run bare-metal with environment modules. -Instead of trying to cover all variations in code, AutoModel provides a -reference sbatch script that you copy and adapt to your cluster. - -### Getting Started - -1. Copy the reference script: - -```bash -cp slurm.sub my_cluster.sub -``` - -2. Edit `my_cluster.sub` — change `CONFIG`, `#SBATCH` directives (account, - partition, nodes, time), container runtime, mounts, and secrets for your - cluster. - -3. Submit the job: - -```bash -sbatch my_cluster.sub -``` - -### How It Works - -The reference `slurm.sub` script: - -1. Sets `CONFIG` to point at your YAML recipe config -2. Allocates nodes via SBATCH directives -3. Sets up the multi-node environment (`MASTER_ADDR`, `MASTER_PORT`) -4. Runs `torchrun -m nemo_automodel.cli.app $CONFIG` on each node via `srun` -5. Each torchrun worker detects the distributed environment and runs the recipe in-process - -All cluster-specific configuration (SBATCH directives, container runtime, -mounts, NCCL tuning, secrets) lives in your sbatch script where you can see -and edit it directly. 
- - -### Examples - -**Pyxis container (NVIDIA clusters):** - -```bash -#!/bin/bash -#SBATCH -A my_account -#SBATCH -p batch -#SBATCH -t 01:00:00 -#SBATCH -N 8 -#SBATCH --gpus-per-node=8 -#SBATCH --ntasks-per-node=1 -#SBATCH -J automodel-finetune -#SBATCH --output=slurm_jobs/%x_%j.out -#SBATCH --error=slurm_jobs/%x_%j.err - -CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - -CONT=/lustre/fsw/images/automodel.sqsh -CONT_NAME=automodel-training -CONT_MOUNT="\ -/home/$USER/Automodel:/opt/Automodel,\ -/home/$USER/.cache/huggingface:/root/.cache/huggingface" - -export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -export MASTER_PORT=13742 - -srun \ - --container-name="${CONT_NAME}" \ - --container-image="${CONT}" \ - --container-mounts="${CONT_MOUNT}" \ - --container-entrypoint \ - --no-container-mount-home \ - --export=ALL \ - bash -c "\ - cd /opt/Automodel && \ - torchrun \ - --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \ - --nnodes=\${SLURM_NNODES:-1} \ - --rdzv_backend=c10d \ - --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \ - -m nemo_automodel.cli.app ${CONFIG}" -``` - -**Bare-metal (no container):** - -```bash -#!/bin/bash -#SBATCH -A my_account -#SBATCH -p gpu -#SBATCH -N 2 -#SBATCH --gpus-per-node=8 -#SBATCH --time=01:00:00 - -CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - -export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=13742 - -module load cuda/12.8 -source /opt/venvs/automodel/bin/activate - -srun bash -c "\ - torchrun \ - --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \ - --nnodes=\${SLURM_NNODES:-1} \ - --rdzv_backend=c10d \ - --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \ - -m nemo_automodel.cli.app ${CONFIG}" -``` - -**Apptainer / Singularity:** - -```bash -#!/bin/bash -#SBATCH -A my_account -#SBATCH -p gpu -#SBATCH -N 2 -#SBATCH --gpus-per-node=8 -#SBATCH --time=01:00:00 - -CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml - -export 
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) -export MASTER_PORT=13742 - -srun apptainer exec --nv /shared/images/automodel.sif \ - bash -c "\ - torchrun \ - --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \ - --nnodes=\${SLURM_NNODES:-1} \ - --rdzv_backend=c10d \ - --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \ - -m nemo_automodel.cli.app ${CONFIG}" -``` - - -### Launch with Modified Code - -If the script is executed from within a Git repository accessible to Slurm -workers, automodel will use the repository source over the installation -inside the container image (it prepends `$CWD` to `PYTHONPATH` when it -detects an editable checkout). - -```bash -git clone git@github.com:NVIDIA-NeMo/Automodel.git automodel_test_repo -cd automodel_test_repo/ -sbatch slurm.sub -``` - -## Customize Configuration Settings - -You can customize training by following the steps in this section. - -1. **Override config values**: Edit the `CONFIG` variable and add CLI overrides - in your torchrun command inside the sbatch script. For example, to change - the model: - ```bash - -m nemo_automodel.cli.app ${CONFIG} --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B - ``` - -2. **Edit the config file**: Modify the YAML directly for persistent changes. - -3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory. - -For single-node workflows, see our [Run on Your Local Workstation](./local-workstation.md) guide. - -``` - -File: /Users/mromeijn/src/Automodel/docs/launcher/nemo-run.md -```md -# Run with NeMo-Run - -In this guide, you will learn how to launch NeMo AutoModel training jobs using [NeMo-Run](https://github.com/NVIDIA/NeMo-Run). NeMo-Run supports multiple backends including Slurm, Kubernetes, Docker, and local execution. For cloud-based training, see [Run on Any Cloud with SkyPilot](./skypilot.md). For direct sbatch usage, see [Run on a Cluster (Slurm)](./slurm.md). 
For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md). - -NeMo-Run is an open-source tool from NVIDIA that manages job submission across different execution backends. You define your compute configuration once in a Python file and reuse it across all your training jobs. - -## Before You Begin - -1. **Install NeMo-Run** (it is not bundled with AutoModel): - -```bash -pip install nemo-run -``` - -2. **Create an executor definitions file** at `$NEMORUN_HOME/executors.py`. `NEMORUN_HOME` defaults to `~/.nemo_run`; set the environment variable to use a different location. This file tells NeMo-Run how to reach your compute target. Every executor you reference in a YAML config must be defined here. See [Executor Setup](#executor-setup) for a complete example. - -3. **Verify connectivity** to the target in your executor (e.g. SSH for Slurm, kubeconfig for Kubernetes). - -4. **Set required environment variables** (if needed by your training config): - -```bash -export HF_TOKEN=hf_... # Required for gated models (e.g. Llama) -export WANDB_API_KEY=... # Optional: Weights & Biases logging -``` - -## Executor Setup - -The `executor:` field in your YAML config is a name that maps to an entry in `$NEMORUN_HOME/executors.py`. This file must define a module-level `EXECUTOR_MAP` dictionary. 
NeMo-Run supports several executor types -- here are examples of the most common ones: - -### Slurm Executor - -```python -import nemo_run as run - -def my_slurm_cluster(): - executor = run.SlurmExecutor( - account="my_account", - partition="batch", - tunnel=run.SSHTunnel( - user="myuser", - host="login-node.example.com", - job_dir="/remote/path/nemo_run/jobs", - ), - nodes=1, - ntasks_per_node=8, - gpus_per_node=8, - mem="0", - exclusive=True, - packager=run.Packager(), - ) - executor.container_image = "nvcr.io/nvidia/nemo-automodel:26.02" - executor.container_mounts = ["/data:/data", "/checkpoints:/checkpoints"] - executor.env_vars = {"HF_HOME": "/data/hf_cache"} - executor.time = "04:00:00" - return executor - -EXECUTOR_MAP = { - "my_slurm": my_slurm_cluster(), -} -``` - -### Kubernetes Executor - -```python -import nemo_run as run - -def my_k8s_cluster(): - return run.KubeflowExecutor( - namespace="training", - image="nvcr.io/nvidia/nemo-automodel:26.02", - num_nodes=1, - nprocs_per_node=8, - gpus_per_node=8, - ) - -EXECUTOR_MAP = { - "my_k8s": my_k8s_cluster(), -} -``` - -### Multiple Executors - -You can define as many executors as you need for different backends, clusters, or resource configurations: - -```python -EXECUTOR_MAP = { - "slurm_dev": my_slurm_dev(), - "slurm_prod": my_slurm_prod(), - "k8s": my_k8s_cluster(), -} -``` - -- Keys in `EXECUTOR_MAP` are names you reference in YAML (`executor: slurm_dev`). -- Values can be executor instances or zero-argument callables that return one. -- Override fields in the YAML (`nodes`, `devices`, `container_image`, etc.) are applied on top of the executor defaults. - -## Quickstart - -Any existing AutoModel YAML config can be run via NeMo-Run by adding a `nemo_run:` section at the top. 
For example, given an existing config that you run locally: - -```bash -automodel examples/llm_finetune/qwen/qwen3_moe_30b_te_packed_sequence.yaml -``` - -Add a `nemo_run:` block to submit it to a remote executor instead: - -```yaml -# -- Add this section to any existing config ---------------------------------- -nemo_run: - executor: my_slurm # Name from EXECUTOR_MAP in $NEMORUN_HOME/executors.py - container_image: /images/custom.sqsh # Override executor's default image - nodes: 1 # Override number of nodes - ntasks_per_node: 8 # GPUs per node - time: "04:00:00" # Override time limit - job_name: qwen3_moe_finetune # Experiment and job name - -# -- Everything below is your existing training config (unchanged) ------------ -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 32 - # ... rest of your config ... -``` - -Then run the same command: - -```bash -automodel your_config.yaml -``` - -The CLI detects the `nemo_run:` key, strips it from the training config, loads the named executor from `$NEMORUN_HOME/executors.py`, and submits the job -- all in one command. - -## Configuration Reference - -### All `nemo_run:` Fields - -| Field | Default | Description | -|---|---|---| -| `executor` | `"local"` | Name from `EXECUTOR_MAP` in `$NEMORUN_HOME/executors.py`, or `"local"` for local execution | -| `job_name` | `` | Experiment and job name | -| `detach` | `true` | Return immediately after submission | -| `tail_logs` | `false` | Stream logs after submission | -| `executors_file` | `$NEMORUN_HOME/executors.py` | Path to the executor definitions file | -| `job_dir` | `./nemo_run_jobs` | Local directory for job artifacts (config snapshot) | -| *(any other key)* | *(from executor)* | Applied directly to the executor via `setattr`. Use the executor's native attribute names (e.g. `nodes`, `ntasks_per_node`, `partition`, `container_image`, `time`, `env_vars`). Dicts are merged, lists are extended. 
| - -## Examples - -### Single-Node Fine-Tuning (1 x 8 GPUs) - -```yaml -nemo_run: - executor: my_slurm - nodes: 1 - ntasks_per_node: 8 - job_name: single_node_finetune -``` - -### Multi-Node Distributed Training (2 x 8 GPUs) - -```yaml -nemo_run: - executor: my_slurm - nodes: 2 - ntasks_per_node: 8 - time: "08:00:00" - job_name: multinode_pretrain -``` - -For multi-node jobs the launcher automatically adds `--nnodes`, `--node-rank`, `--rdzv-backend`, `--master-addr`, and `--master-port` to the `torchrun` command. - -### Custom Container Image and Mounts - -```yaml -nemo_run: - executor: my_slurm - container_image: /images/automodel_nightly.sqsh - container_mounts: - - /scratch/datasets:/datasets - - /scratch/checkpoints:/checkpoints - env_vars: - HF_HOME: /datasets/hf_cache - NCCL_DEBUG: INFO -``` - -### Local Execution (No Cluster) - -Use `executor: local` to run on the current machine. No `$NEMORUN_HOME/executors.py` entry is needed: - -```yaml -nemo_run: - executor: local - ntasks_per_node: 2 - job_name: local_test -``` - -## Monitor and Manage Jobs - -NeMo-Run stores experiment metadata under `$NEMORUN_HOME/experiments/`. Set `tail_logs: true` in the YAML to stream job output after submission. - -For Slurm-based executors, standard Slurm commands also work: - -```bash -squeue -u $USER # List your queued and running jobs -scancel # Cancel a running or pending job -sacct -j # View job accounting information -``` - -For Kubernetes-based executors, use `kubectl` to monitor pods and jobs. - -## How It Works - -1. The `automodel` CLI detects the `nemo_run:` key and imports `NemoRunLauncher`. -2. The `nemo_run:` section is popped from the config. The remaining training config is written to `nemo_run_jobs//job_config.yaml` for record-keeping. -3. The launcher loads a pre-configured executor from `$NEMORUN_HOME/executors.py` by name (or creates a `LocalExecutor` for `executor: local`). Override fields are applied on top of the executor defaults. -4. 
The training config YAML is embedded in a self-contained inline bash script via a heredoc, so no separate file transfer is needed. -5. A `torchrun` command is built with `--nproc-per-node` and (for multi-node) distributed rendezvous arguments. -6. The script is submitted via `nemo_run.Experiment`. By default the call returns immediately (`detach=True`). - -## Customize Configuration - -Override any training parameter from the command line, same as with local runs: - -```bash -automodel config_with_nemo_run.yaml \ - --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B -``` - -## When to Use NeMo-Run vs. SkyPilot vs. Slurm - -| | NeMo-Run | SkyPilot | Slurm (sbatch) | -|---|---|---|---| -| **Infrastructure** | Slurm, Kubernetes, Docker, local | Public cloud (AWS, GCP, Azure) | On-prem HPC | -| **Container support** | Yes (Pyxis/Enroot, Docker, K8s pods) | N/A (cloud VMs) | Manual (in sbatch script) | -| **Setup required** | `nemo-run` + `$NEMORUN_HOME/executors.py` | Cloud credentials + `sky check` | Cluster access + sbatch script | -| **Job submission** | `automodel config.yaml` | `automodel config.yaml` | `sbatch slurm.sub` | -| **Good for** | Managed multi-backend execution, reusable executor configs | Cloud burst, cost optimization, spot instances | Direct Slurm scripts, full control over sbatch | - -``` - -File: /Users/mromeijn/src/Automodel/docs/launcher/skypilot.md -```md -# Run on Any Cloud with SkyPilot - -In this guide, you will learn how to launch NeMo AutoModel training jobs on any major cloud provider (AWS, GCP, Azure, Lambda, Kubernetes) using [SkyPilot](https://skypilot.readthedocs.io). For on-premises cluster usage, see [Run on a Cluster (Slurm)](./slurm.md). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md). 
- -SkyPilot is an open-source framework that abstracts cloud infrastructure so you can train on whichever cloud is cheapest or most available at launch time — including automatic spot-instance handling for significant cost savings. - -## Before You Begin - -Complete the following setup steps before launching your first AutoModel job on a cloud provider. - -1. **Install SkyPilot** with the connector for your target cloud: - -```bash -pip install "skypilot[gcp]" # Google Cloud -pip install "skypilot[aws]" # Amazon Web Services -pip install "skypilot[azure]" # Microsoft Azure -pip install "skypilot[lambda]" # Lambda Cloud -pip install "skypilot[kubernetes]" # Any Kubernetes cluster -``` - -2. **Configure your cloud credentials** by following the SkyPilot credential setup guide for your cloud, then verify: - -```bash -sky check -``` - -You should see at least one cloud listed as **OK**. - -3. **Set required environment variables:** - -```bash -export HF_TOKEN=hf_... # Required for gated models (e.g. Llama) -export WANDB_API_KEY=... # Optional: Weights & Biases logging -``` - -## Quickstart - -Add a `skypilot:` section to any existing config YAML, then run the same `automodel` command you already know: - -```bash -automodel finetune llm -c your_config_with_skypilot.yaml -``` - -The CLI detects the `skypilot:` key, strips it from the training config, uploads the code and config to a cloud VM, and launches training — all in one command. - -## Configuration Reference - -Below is an annotated example for fine-tuning Llama-3.2-1B on SQuAD on a GCP spot T4. A ready-to-run copy lives at [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml). - -```yaml -# ── SkyPilot launcher section ───────────────────────────────────────────── -# Removed before the training config reaches the remote VM. 
-skypilot: - cloud: gcp # aws | gcp | azure | lambda | kubernetes - accelerators: T4:1 # GPU type:count per node, e.g. A100:8 - use_spot: true # ~80 % cost reduction vs on-demand - disk_size: 100 # Remote VM disk size in GB - num_nodes: 1 # Increase for multi-node distributed training - region: us-central1 # Optional — SkyPilot picks cheapest if omitted - job_name: llama3_2_finetune # Also used as the SkyPilot cluster name - - # Use env-var placeholders so secrets are never stored in YAML - hf_token: ${HF_TOKEN} - # wandb_key: ${WANDB_API_KEY} - - # Optional: extra shell commands run on the VM after `pip install -e .` - # setup: | - # pip install some-extra-dependency - - # Optional: override the default output directory (default: ./skypilot_jobs) - # job_dir: /path/to/skypilot/jobs - -# ── Training config (forwarded to the VM unchanged) ─────────────────────── -step_scheduler: - global_batch_size: 64 - local_batch_size: 8 - num_epochs: 1 - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -# ... rest of your training config ... -``` - -### All `skypilot:` Fields - -| Field | Default | Description | -|---|---|---| -| `cloud` | *(required)* | Cloud provider: `aws`, `gcp`, `azure`, `lambda`, `kubernetes` | -| `accelerators` | `T4:1` | GPU type and count per node, e.g. 
`A100:8`, `V100:4` | -| `num_nodes` | `1` | Number of VMs for distributed training | -| `use_spot` | `true` | Use spot/preemptible instances | -| `disk_size` | `100` | Remote VM disk size in GB | -| `region` | *(auto)* | Cloud region; SkyPilot selects cheapest if omitted | -| `zone` | *(auto)* | Availability zone within the region | -| `instance_type` | *(auto)* | Specific instance type; auto-selected if omitted | -| `job_name` | `_` | Job and SkyPilot cluster name | -| `setup` | *(auto)* | Extra setup commands run after `pip install -e .` | -| `hf_home` | `~/.cache/huggingface` | Hugging Face cache directory on the remote VM | -| `hf_token` | `$HF_TOKEN` env | Hugging Face token for gated model access | -| `wandb_key` | `$WANDB_API_KEY` env | Weights & Biases API key | -| `env_vars` | `{}` | Additional environment variables for the remote VM | -| `job_dir` | `./skypilot_jobs` | Local directory for job artifacts (config snapshot, logs) | -| `gpus_per_node` | *(parsed from `accelerators`)* | Override GPU count per node passed to `torchrun` | - -## Cloud Examples - -### AWS — On-Demand A10G - -```yaml -skypilot: - cloud: aws - accelerators: A10G:1 - use_spot: false - region: us-east-1 - job_name: llm_aws_finetune - hf_token: ${HF_TOKEN} -``` - -### GCP — spot V100, 8 GPUs (single node) - -```yaml -skypilot: - cloud: gcp - accelerators: V100:8 - use_spot: true - region: us-west1 - job_name: llm_gcp_v100_8gpu - hf_token: ${HF_TOKEN} -``` - -### Multi-node distributed training (2 × 8 × A100) - -```yaml -skypilot: - cloud: gcp - accelerators: A100:8 - num_nodes: 2 - use_spot: false - job_name: llm_multinode_a100 - hf_token: ${HF_TOKEN} -``` - -For multi-node jobs the launcher automatically adds the SkyPilot rendezvous environment variables (`$SKYPILOT_NODE_RANK`, `$SKYPILOT_NUM_NODES`, `$SKYPILOT_NODE_IPS`) to the `torchrun` command. 
- -## Monitor and Manage Jobs - -After submitting, use standard SkyPilot commands: - -```bash -sky status # List running clusters and their status -sky logs # Stream training logs -sky ssh # SSH into the VM for debugging -sky cancel # Cancel a running job -sky down # Terminate the cluster and stop billing -``` - -## How It Works - -1. The `automodel` CLI detects the `skypilot:` key in the YAML and calls `launch_with_skypilot()`. -2. The training config (with `skypilot:` removed) is written to a local `skypilot_jobs//job_config.yaml`. -3. A `sky.Task` is created with: - - **workdir** — the current directory synced to `~/sky_workdir` on the remote VM. - - **file_mounts** — the job config uploaded to `/tmp/automodel_job_config.yaml`. - - **setup** — `pip install -e .` (plus any custom `setup:` commands). - - **run** — a `torchrun` command pointing at the recipe script and config. -4. `sky.launch()` provisions the VM, runs setup, then executes training. The call returns immediately (`detach_run=True`); use `sky logs` to follow progress. - -## Customize Configuration - -Override any training parameter from the command line, same as with local runs: - -```bash -automodel finetune llm -c config_with_skypilot.yaml \ - --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B -``` - -## When to Use SkyPilot vs. Slurm - -| | SkyPilot | Slurm | -|---|---|---| -| **Infrastructure** | Any public cloud | On-premises HPC cluster | -| **Spot instances** | Yes (automatic) | Depends on cluster config | -| **Setup required** | Cloud credentials + `sky check` | Cluster access | -| **Good for** | Flexible cloud burst, cost optimization | Fixed on-prem GPU clusters | - -``` - -File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - - -# To run this recipe: -# automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml --nproc-per-node 8 -# Adjust --nproc-per-node to the number of GPUs available on your machine. - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 64 - local_batch_size: 8 - ckpt_every_steps: 1000 - val_every_steps: 10 # will run every x number of gradient steps - num_epochs: 1 - -dist_env: - backend: nccl - timeout_minutes: 1 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1111 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -# torch.compile configuration -compile: - enabled: false - mode: "default" # Options: "default", "reduce-overhead", "max-autotune" - fullgraph: false - dynamic: true # Set to false for better performance with fixed shapes - backend: null # Use default backend (inductor) - -clip_grad_norm: - max_norm: 1.0 - -distributed: - strategy: fsdp2 - dp_size: none - tp_size: 1 - cp_size: 1 - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: train - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - shuffle: false - -validation_dataset: - _target_: 
nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - limit_dataset_samples: 64 - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - -optimizer: - _target_: torch.optim.Adam - betas: [0.9, 0.999] - eps: 1e-8 - lr: 1.0e-5 - weight_decay: 0 - # min_lr: 1.0e-5 - -lr_scheduler: - lr_decay_style: cosine - min_lr: 1.0e-6 - -# Uncomment and configure for W&B logging -# wandb: -# project: -# entity: -# name: -# save_dir: - -# Uncomment and configure for Mlflow logging -# mlflow: -# experiment_name: "automodel-llm-llama3_2_1b_squad-finetune" -# run_name: "" -# tracking_uri: null -# artifact_location: null -# tags: -# task: "squad-finetune" -# model_family: "llama3.2" -# model_size: "1b" -# dataset: "squad" -# framework: "automodel" - -ci: - recipe_owner: akoumpa - -``` - -File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Fine-tune Llama-3.2-1B on SQuAD using SkyPilot for cloud execution. -# -# Usage: -# automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml -# -# Prerequisites: -# pip install "skypilot[gcp]" # or [aws], [azure], etc. 
-# sky check # verify cloud credentials -# -# Monitor: -# sky status -# sky logs - -# --------------------------------------------------------------------------- -# SkyPilot launcher config (removed before the job config reaches the VM) -# --------------------------------------------------------------------------- -skypilot: - cloud: gcp # aws | gcp | azure | lambda | kubernetes - accelerators: T4:1 # GPU type:count per node - use_spot: true # ~80 % cost reduction vs on-demand - disk_size: 100 # GB - num_nodes: 1 - region: us-central1 # optional; SkyPilot picks cheapest if omitted - job_name: llama3_2_1b_squad - - # Credentials – use env-var placeholders so secrets are never stored in YAML. - hf_token: ${HF_TOKEN} - # wandb_key: ${WANDB_API_KEY} - - # Extra setup commands run on the VM after `pip install -e .` - # setup: | - # pip install some-extra-dependency - -# --------------------------------------------------------------------------- -# Training config (forwarded to the VM unchanged) -# --------------------------------------------------------------------------- -recipe: - _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 64 - local_batch_size: 8 - ckpt_every_steps: 1000 - val_every_steps: 10 - num_epochs: 1 - -dist_env: - backend: nccl - timeout_minutes: 1 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1111 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -compile: - enabled: false - mode: "default" - fullgraph: false - dynamic: true - backend: null - -clip_grad_norm: - max_norm: 1.0 - -distributed: - strategy: fsdp2 - dp_size: none - tp_size: 1 - cp_size: 1 - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: 
rajpurkar/squad - split: train - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - shuffle: false - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - limit_dataset_samples: 64 - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - -optimizer: - _target_: torch.optim.Adam - betas: [0.9, 0.999] - eps: 1e-8 - lr: 1.0e-5 - weight_decay: 0 - -lr_scheduler: - lr_decay_style: cosine - min_lr: 1.0e-6 - -ci: - recipe_owner: adil-a - -``` - diff --git a/skills/nemotron-customize/context/automodel-sft-peft-core.txt b/skills/nemotron-customize/context/automodel-sft-peft-core.txt deleted file mode 100644 index 5e2f02727..000000000 --- a/skills/nemotron-customize/context/automodel-sft-peft-core.txt +++ /dev/null @@ -1,6696 +0,0 @@ - -/Users/mromeijn/src/Automodel -├── docs -│ ├── guides -│ │ ├── llm -│ │ │ ├── dataset.md * -│ │ │ └── finetune.md * -│ │ ├── diffusion -│ │ ├── omni -│ │ ├── vlm -│ │ ├── checkpointing.md * -│ │ └── dataset-overview.md * -│ ├── about -│ ├── launcher -│ └── model-coverage -│ ├── diffusion -│ │ ├── black-forest-labs -│ │ ├── hunyuanvideo-community -│ │ └── wan-ai -│ ├── llm -│ │ ├── allenai -│ │ ├── baai -│ │ ├── baichuan-inc -│ │ ├── bigcode -│ │ ├── bytedance-seed -│ │ ├── cohere -│ │ ├── deepseek-ai -│ │ ├── eleutherai -│ │ ├── google -│ │ ├── ibm -│ │ ├── inceptionai -│ │ ├── internlm -│ │ ├── lgai-exaone -│ │ ├── meta -│ │ ├── microsoft -│ │ ├── minimax -│ │ ├── mistralai -│ │ ├── moonshotai -│ │ ├── nvidia -│ │ ├── openai -│ │ ├── openbmb -│ │ ├── orionstar -│ │ ├── parasail-ai -│ │ ├── qwen -│ │ ├── stabilityai -│ │ ├── stepfun-ai -│ │ ├── thudm -│ │ ├── tiiuae -│ │ └── 
upstage -│ ├── omni -│ │ ├── microsoft -│ │ └── qwen -│ └── vlm -│ ├── google -│ ├── huggingface -│ ├── internlm -│ ├── llava-hf -│ ├── meta -│ ├── mistralai -│ ├── moonshotai -│ ├── nvidia -│ └── qwen -├── examples -│ ├── llm_finetune -│ │ ├── llama3_1 -│ │ │ └── llama3_1_8b_columnmapped_lora.yaml * -│ │ ├── llama3_2 -│ │ │ ├── llama3_2_1b_squad.yaml * -│ │ │ └── llama3_2_1b_squad_peft.yaml * -│ │ ├── baichuan -│ │ ├── cohere -│ │ ├── deepseek_v32 -│ │ ├── devstral -│ │ ├── falcon -│ │ ├── gemma -│ │ ├── glm -│ │ ├── gpt_oss -│ │ ├── granite -│ │ ├── llama3_3 -│ │ ├── minimax_m2 -│ │ ├── mistral -│ │ ├── moonlight -│ │ ├── nemotron -│ │ ├── nemotron_flash -│ │ ├── olmo -│ │ ├── phi -│ │ ├── qwen -│ │ ├── seed -│ │ ├── starcoder -│ │ └── stepfun -│ ├── convergence -│ │ └── tulu3 -│ │ ├── data -│ │ ├── eval -│ │ ├── inference -│ │ ├── model-verification -│ │ ├── models -│ │ │ ├── gpt-oss-20b -│ │ │ │ └── assets -│ │ │ ├── moonlight-16b -│ │ │ │ └── assets -│ │ │ ├── qwen3-4b -│ │ │ │ └── assets -│ │ │ └── qwen3-moe-30b -│ │ │ ├── assets -│ │ │ └── experiments -│ │ └── training -│ ├── diffusion -│ │ ├── finetune -│ │ ├── generate -│ │ │ └── configs -│ │ └── pretrain -│ ├── dllm_generate -│ ├── dllm_sft -│ ├── llm_benchmark -│ │ ├── deepseek -│ │ ├── glm -│ │ ├── gpt_oss -│ │ ├── kimi -│ │ ├── llama3_3 -│ │ ├── minimax -│ │ ├── mistral -│ │ ├── moonlight -│ │ ├── nemotron -│ │ ├── qwen -│ │ └── step -│ ├── llm_kd -│ │ └── llama3_2 -│ ├── llm_pretrain -│ ├── llm_seq_cls -│ │ └── glue -│ ├── retrieval -│ │ ├── bi_encoder -│ │ │ └── llama_embed_nemotron_8b -│ │ ├── cross_encoder -│ │ └── data_utils -│ ├── vlm_benchmark -│ │ ├── kimi -│ │ ├── mistral -│ │ └── qwen -│ ├── vlm_finetune -│ │ ├── gemma3 -│ │ ├── gemma3n -│ │ ├── gemma4 -│ │ ├── internvl -│ │ ├── kimi -│ │ ├── mistral -│ │ ├── mistral4 -│ │ ├── nemotron -│ │ ├── phi4 -│ │ ├── qwen2_5 -│ │ ├── qwen3 -│ │ ├── qwen3_5 -│ │ └── qwen3_5_moe -│ └── vlm_generate -├── nemo_automodel -│ ├── components -│ │ ├── _peft -│ 
│ │ ├── lora.py * + -│ │ │ └── module_matcher.py * + -│ │ ├── datasets -│ │ │ ├── llm -│ │ │ │ ├── megatron -│ │ │ │ ├── chat_dataset.py * + -│ │ │ │ └── column_mapped_text_instruction_dataset.py * + -│ │ │ ├── diffusion -│ │ │ ├── dllm -│ │ │ ├── vlm -│ │ │ └── utils.py * + -│ │ ├── attention -│ │ ├── checkpoint -│ │ │ └── _backports -│ │ ├── config -│ │ ├── distributed -│ │ │ └── pipelining -│ │ ├── flow_matching -│ │ │ └── adapters -│ │ ├── launcher -│ │ │ ├── nemo_run -│ │ │ └── skypilot -│ │ ├── loggers -│ │ ├── loss -│ │ │ └── triton -│ │ ├── models -│ │ │ ├── baichuan -│ │ │ ├── common -│ │ │ ├── deepseek_v3 -│ │ │ ├── deepseek_v32 -│ │ │ ├── gemma4_moe -│ │ │ ├── glm4_moe -│ │ │ ├── glm4_moe_lite -│ │ │ ├── glm_moe_dsa -│ │ │ ├── gpt_oss -│ │ │ ├── kimi_k25_vl -│ │ │ ├── kimivl -│ │ │ ├── llama -│ │ │ ├── llama_bidirectional -│ │ │ ├── minimax_m2 -│ │ │ ├── mistral3 -│ │ │ ├── mistral4 -│ │ │ ├── nemotron_parse -│ │ │ ├── nemotron_v3 -│ │ │ ├── qwen2 -│ │ │ ├── qwen3_5_moe -│ │ │ ├── qwen3_moe -│ │ │ ├── qwen3_next -│ │ │ ├── qwen3_omni_moe -│ │ │ ├── qwen3_vl_moe -│ │ │ └── step3p5 -│ │ ├── moe -│ │ │ ├── megatron -│ │ │ └── uccl_ep -│ │ ├── optim -│ │ ├── quantization -│ │ ├── training -│ │ └── utils -│ ├── recipes -│ │ ├── llm -│ │ │ └── train_ft.py * + -│ │ ├── diffusion -│ │ ├── dllm -│ │ ├── retrieval -│ │ └── vlm -│ ├── _diffusers -│ ├── _transformers -│ │ └── tokenization -│ ├── autonvtx -│ ├── cli -│ └── shared -├── .github -│ ├── actions -│ │ ├── build-container -│ │ └── test-template -│ └── workflows -│ └── config -├── docker -│ └── common -├── scripts -├── skills -│ ├── .claude -│ │ └── skills -│ │ ├── developer-guide -│ │ ├── distributed-training -│ │ ├── launcher-config -│ │ ├── model-onboarding -│ │ ├── parity-testing -│ │ └── recipe-development -│ ├── developer-guide -│ ├── distributed-training -│ ├── launcher-config -│ ├── model-onboarding -│ ├── parity-testing -│ └── recipe-development -├── tests -│ ├── ci_tests -│ │ ├── configs -│ │ │ ├── 
llm_benchmark -│ │ │ ├── llm_finetune -│ │ │ ├── vlm_benchmark -│ │ │ └── vlm_finetune -│ │ ├── golden_values -│ │ │ ├── llm_finetune -│ │ │ │ ├── baichuan -│ │ │ │ ├── falcon -│ │ │ │ ├── gemma -│ │ │ │ ├── glm -│ │ │ │ ├── gpt_oss -│ │ │ │ ├── granite -│ │ │ │ ├── llama3_1 -│ │ │ │ ├── llama3_2 -│ │ │ │ ├── mistral -│ │ │ │ ├── moonlight -│ │ │ │ ├── nemotron -│ │ │ │ ├── nemotron_flash -│ │ │ │ ├── olmo -│ │ │ │ ├── phi -│ │ │ │ ├── qwen -│ │ │ │ ├── seed -│ │ │ │ └── starcoder -│ │ │ └── vlm_finetune -│ │ │ ├── gemma3 -│ │ │ ├── gemma3n -│ │ │ ├── internvl -│ │ │ ├── mistral -│ │ │ ├── nemotron -│ │ │ ├── qwen2_5 -│ │ │ ├── qwen3 -│ │ │ └── qwen3_5_moe -│ │ ├── scripts -│ │ └── utils -│ ├── functional_tests -│ │ ├── checkpoint -│ │ ├── checkpoint_robustness -│ │ ├── context_parallel -│ │ ├── data -│ │ │ └── llm -│ │ ├── datasets -│ │ │ └── llm -│ │ ├── hf_dcp -│ │ ├── hf_peft -│ │ ├── hf_transformer -│ │ ├── hf_transformer_finetune -│ │ ├── hf_transformer_llm -│ │ ├── hf_transformer_vlm -│ │ ├── llm_pretrain_and_kd -│ │ │ ├── customizer_retrieval -│ │ │ ├── llm_seq_cls -│ │ │ └── loss -│ │ ├── retrieval -│ │ └── training -│ ├── unit_tests -│ │ ├── _cli -│ │ ├── _diffusers -│ │ ├── _peft -│ │ ├── _transformers -│ │ ├── attention -│ │ ├── checkpoint -│ │ ├── components -│ │ │ └── training -│ │ ├── config -│ │ ├── datasets -│ │ │ ├── diffusion -│ │ │ ├── dllm -│ │ │ ├── llm -│ │ │ └── vlm -│ │ ├── diffusion_processors -│ │ ├── distributed -│ │ │ └── pipelining -│ │ ├── flow_matching -│ │ │ └── adapters -│ │ ├── launcher -│ │ ├── loggers -│ │ ├── loss -│ │ ├── models -│ │ │ ├── baichuan -│ │ │ ├── bi_encoder -│ │ │ ├── common -│ │ │ ├── deepseek_v3 -│ │ │ ├── deepseek_v32 -│ │ │ ├── gemma4 -│ │ │ ├── glm4_moe -│ │ │ ├── glm4_moe_lite -│ │ │ ├── glm_moe_dsa -│ │ │ ├── gpt_oss -│ │ │ ├── kimi_k25_vl -│ │ │ ├── kimivl -│ │ │ ├── llama -│ │ │ ├── minimax_m2 -│ │ │ ├── mistral3 -│ │ │ ├── mistral4 -│ │ │ ├── nemotron_parse -│ │ │ ├── nemotron_v3 -│ │ │ ├── qwen2 -│ │ │ 
├── qwen3_5 -│ │ │ ├── qwen3_5_moe -│ │ │ ├── qwen3_moe -│ │ │ ├── qwen3_next -│ │ │ ├── qwen3_omni_moe -│ │ │ ├── qwen3_vl_moe -│ │ │ └── step3p5 -│ │ ├── moe -│ │ ├── optim -│ │ ├── quantization -│ │ ├── recipes -│ │ │ ├── dllm -│ │ │ └── llm -│ │ ├── shared -│ │ ├── tools -│ │ ├── training -│ │ └── utils -│ └── utils -├── tools -│ └── diffusion -│ ├── data -│ └── processors -└── tutorials - └── nemotron-parse - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; selected files shown. - - -File: /Users/mromeijn/src/Automodel/nemo_automodel/recipes/llm/train_ft.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import inspect -import logging -import pathlib -import time -from contextlib import nullcontext -from typing import TYPE_CHECKING, Any, Dict, Optional - -import torch -import torch.nn as nn -import wandb -from huggingface_hub import constants as hf_constants -from torch.utils.data import DataLoader, IterableDataset -from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp -from torchdata.stateful_dataloader.sampler import StatefulDistributedSampler -from transformers import AutoConfig -from transformers.tokenization_utils_base import PreTrainedTokenizerBase -from wandb import Settings - -from nemo_automodel._transformers import NeMoAutoModelForCausalLM, NeMoAutoModelForSequenceClassification -from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer -from nemo_automodel._transformers.infrastructure import ( - apply_model_infrastructure, - instantiate_infrastructure, -) -from nemo_automodel._transformers.mfu import AutoMFU -from nemo_automodel._transformers.utils import apply_cache_compatibility_patches -from nemo_automodel.components.checkpoint.checkpointing import ( - Checkpointer, - CheckpointingConfig, -) -from nemo_automodel.components.config._arg_parser import parse_args_and_load_config -from nemo_automodel.components.datasets.llm.megatron.sampler import create_megatron_sampler -from nemo_automodel.components.datasets.llm.megatron_dataset import MegatronPretraining -from nemo_automodel.components.datasets.llm.packed_sequence import pack_dataset -from nemo_automodel.components.distributed.config import MegatronFSDPConfig -from nemo_automodel.components.distributed.cp_utils import make_cp_batch_and_ctx -from nemo_automodel.components.distributed.init_utils import ( - initialize_distributed, -) -from nemo_automodel.components.distributed.megatron_fsdp import fully_shard_optimizer -from nemo_automodel.components.distributed.mesh import MeshContext -from 
nemo_automodel.components.distributed.pipelining import AutoPipeline -from nemo_automodel.components.distributed.utils import FirstRankPerNode, get_sync_ctx -from nemo_automodel.components.loggers.comet_utils import build_comet -from nemo_automodel.components.loggers.log_utils import setup_logging -from nemo_automodel.components.loggers.metric_logger import MetricsSample, build_metric_logger -from nemo_automodel.components.loggers.mlflow_utils import build_mlflow -from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages -from nemo_automodel.components.loss.linear_ce import FusedLinearCrossEntropy -from nemo_automodel.components.loss.masked_ce import MaskedCrossEntropy -from nemo_automodel.components.moe.megatron.moe_utils import MoEAuxLossAutoScaler -from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler -from nemo_automodel.components.optim.utils import build_dion_optimizer, is_dion_optimizer -from nemo_automodel.components.quantization.fp8 import build_fp8_config -from nemo_automodel.components.training.model_output_utils import get_final_hidden_states -from nemo_automodel.components.training.rng import ScopedRNG, StatefulRNG -from nemo_automodel.components.training.step_scheduler import StepScheduler -from nemo_automodel.components.training.utils import ( - count_tail_padding, - prepare_after_first_microbatch, - prepare_for_final_backward, - prepare_for_grad_accumulation, - scale_grads_and_clip_grad_norm, -) -from nemo_automodel.components.utils.compile_utils import ( - build_compile_config, -) -from nemo_automodel.components.utils.flops_utils import calculate_mfu -from nemo_automodel.components.utils.model_utils import ( - _supports_logits_to_keep, - _supports_seq_lens, - filter_forward_kwargs, - resolve_trust_remote_code, -) -from nemo_automodel.recipes._dist_setup import setup_distributed -from nemo_automodel.recipes.base_recipe import BaseRecipe -from nemo_automodel.shared.te_patches import apply_te_patches 
-from nemo_automodel.shared.utils import dtype_from_str - -if TYPE_CHECKING: - from torch.optim import Optimizer - - from nemo_automodel.components.distributed.init_utils import DistInfo - -logger = logging.getLogger(__name__) - - -# --------------------------- -# Stateless helper functions -# --------------------------- -def _get_model_name(cfg_model): - if cfg_model.get("pretrained_model_name_or_path", None) is not None: - return cfg_model.pretrained_model_name_or_path - elif cfg_model.get("config", None) is not None: - if isinstance(cfg_model.config, str): - return cfg_model.config - return cfg_model.config.get("pretrained_model_name_or_path", None) - else: - return None - - -def _uses_te_dot_product_attention(model_or_cfg): - """Check whether the model uses TE DotProductAttention. - - Accepts either an instantiated nn.Module (preferred — inspects actual modules) - or a config object (fallback — checks backend.attn string). - """ - if isinstance(model_or_cfg, torch.nn.Module): - try: - from transformer_engine.pytorch.attention import DotProductAttention - except ImportError: - return False - return any(isinstance(m, DotProductAttention) for m in model_or_cfg.modules()) - # Config fallback for call sites before model is built - return ( - hasattr(model_or_cfg, "backend") and hasattr(model_or_cfg.backend, "attn") and model_or_cfg.backend.attn == "te" - ) - - -def _uses_thd_collater(cfg_dataloader): - from nemo_automodel.components.datasets.utils import packed_sequence_thd_collater - - return ( - True - if hasattr(cfg_dataloader, "collate_fn") and cfg_dataloader.collate_fn == packed_sequence_thd_collater - else False - ) - - -def _get_num_thd_chunks(pp_enabled, cfg): - if pp_enabled: - return cfg.step_scheduler.local_batch_size // cfg.get("distributed.pipeline.pp_microbatch_size", 1) - return 1 - - -def build_model( - cfg_model, - cfg_peft, - seed, - has_packed_sequence=False, - cfg_fp8=None, - cfg_compile=None, - cfg_quantization=None, - device_mesh=None, - 
moe_mesh=None, - distributed_config=None, - pipeline_config=None, - cfg_qat=None, - cfg_moe=None, - activation_checkpointing=False, - unfreeze_modules: list[str] | None = None, - sdpa_method: list[str] | None = None, -) -> tuple[nn.Module | AutoPipeline, list["Optimizer"]]: # noqa: F821 - """Build and initialize a model. - - Args: - cfg_model: Configuration for model instantiation. - cfg_peft: Configuration for PEFT. - seed: Random seed. - has_packed_sequence: Whether using packed sequences. - cfg_fp8: Configuration for FP8. - cfg_compile: Configuration for torch.compile. - cfg_quantization: Configuration for BitsAndBytes quantization. - device_mesh: Device mesh for distributed training. - moe_mesh: MOE mesh for expert parallelism. - distributed_config: Strategy-specific distributed config (FSDP2Config, etc.). - pipeline_config: Pipeline parallelism config. - cfg_qat: Configuration for QAT (will be instantiated to QATConfig). - cfg_moe: MoEParallelizerConfig instance, or ConfigNode to be converted. - activation_checkpointing: Whether to enable activation checkpointing. - unfreeze_modules: List of module names/substrings to unfreeze. - sdpa_method: Explicit list of SDPA backend name strings (e.g. - ``["flash_attention", "efficient_attention"]``), or ``None`` to - auto-select based on CP / activation checkpointing. 
- """ - with ScopedRNG(seed=seed, ranked=True): - kwargs = { - "has_packed_sequence": has_packed_sequence, - "peft_config": cfg_peft, - "device_mesh": device_mesh, - "moe_mesh": moe_mesh, - "distributed_config": distributed_config, - "pipeline_config": pipeline_config, - "sdpa_method": sdpa_method, - } - - if cfg_qat is not None and cfg_qat.get("enabled", False): - if cfg_peft is not None: - raise ValueError("QAT with PEFT is not currently supported") - qat_config_attr = getattr(cfg_qat, "qat_config", None) - if qat_config_attr is not None: - kwargs["qat_config"] = qat_config_attr.instantiate() - else: - # Fallback to legacy quantizer format for backward compatibility - quantizer_attr = getattr(cfg_qat, "quantizer", None) - if quantizer_attr is not None: - kwargs["qat_config"] = quantizer_attr.instantiate() - - if cfg_moe is not None: - from nemo_automodel.components.moe.config import MoEParallelizerConfig - - if isinstance(cfg_moe, MoEParallelizerConfig): - kwargs["moe_config"] = cfg_moe - else: - moe_dict = cfg_moe.to_dict() if hasattr(cfg_moe, "to_dict") else dict(cfg_moe) - # activation_checkpointing is handled separately; strip config keys - moe_dict.pop("activation_checkpointing", None) - moe_dict.pop("_target_", None) - kwargs["moe_config"] = MoEParallelizerConfig(**moe_dict) - kwargs["activation_checkpointing"] = activation_checkpointing - - if cfg_fp8 is not None: - kwargs["fp8_config"] = build_fp8_config(cfg_fp8) - if cfg_compile is not None: - kwargs["compile_config"] = build_compile_config(cfg_compile) - if cfg_quantization is not None: - logger.info("Model weight quantization enabled with BitsAndBytes") - from nemo_automodel.components.quantization.qlora import create_bnb_config - - kwargs["quantization_config"] = create_bnb_config(cfg_quantization) - - is_nemo_auto_model = cfg_model.get("_target_", None) in ( - NeMoAutoModelForCausalLM.from_config, - NeMoAutoModelForCausalLM.from_pretrained, - NeMoAutoModelForSequenceClassification.from_config, - 
NeMoAutoModelForSequenceClassification.from_pretrained, - ) - - if is_nemo_auto_model: - # NeMoAutoModel handles infrastructure internally - model = cfg_model.instantiate(**kwargs) - else: - # For non-NemoAutoModel entry points (e.g., build_gpt2_model), - # instantiate the model first, then apply infrastructure separately. - # Note: sdpa_method is not supported here — SDPA patching only runs - # inside NeMoAutoModel._build_model. - if sdpa_method is not None: - logger.warning("sdpa_method is ignored for non-NeMoAutoModel targets.") - # We must convert config objects into runtime objects (model_wrapper, - # autopipeline, parallelize_fn, etc.) via instantiate_infrastructure, - # exactly as from_pretrained/from_config do internally. - model = cfg_model.instantiate() - - mesh = MeshContext.from_meshes(device_mesh, moe_mesh) - model_wrapper, autopipeline, parallelize_fn, qat_quantizer = instantiate_infrastructure( - distributed_config=distributed_config, - pipeline_config=pipeline_config, - qat_config=kwargs.get("qat_config"), - moe_config=kwargs.get("moe_config"), - activation_checkpointing=kwargs.get("activation_checkpointing", False), - device=torch.device("cuda", torch.cuda.current_device()), - mesh=mesh, - ) - loss_fn = pipeline_config.loss_fn if pipeline_config is not None else None - - model = apply_model_infrastructure( - model, - is_meta_device=False, - device=torch.cuda.current_device(), - mesh=mesh, - model_wrapper=model_wrapper, - autopipeline=autopipeline, - parallelize_fn=parallelize_fn, - qat_quantizer=qat_quantizer, - loss_fn=loss_fn, - peft_config=kwargs.get("peft_config"), - fp8_config=kwargs.get("fp8_config"), - compile_config=kwargs.get("compile_config"), - quantization_config=kwargs.get("quantization_config"), - pretrained_model_name_or_path=None, - load_base_model=False, - cache_dir=hf_constants.HF_HUB_CACHE, - ) - - # Explicitly unfreeze specified modules (e.g. 
task heads) that need full fine-tuning - if unfreeze_modules: - for name, param in model.named_parameters(): - if any(module_name in name for module_name in unfreeze_modules): - param.requires_grad_(True) - logging.info(f"Unfroze parameters matching: {unfreeze_modules}") - - return model - - -def build_optimizer(model, cfg_opt, distributed_config, device_mesh): - """Build an optimizer for the model. - - Args: - model: The model to build an optimizer for. - cfg_opt: The configuration for the optimizer. - distributed_config: The distributed configuration. - device_mesh: The device mesh. - """ - # Resolve dtype strings (e.g. "torch.bfloat16") to torch.dtype objects for - # optimizers like TE FusedAdam that accept dtype kwargs. - for attr in ("master_weight_dtype", "exp_avg_dtype", "exp_avg_sq_dtype"): - val = getattr(cfg_opt, attr, None) - if isinstance(val, str): - setattr(cfg_opt, attr, dtype_from_str(val)) - - if device_mesh is not None and "tp" in device_mesh.mesh_dim_names and device_mesh["tp"].size() > 1: - # TP does not support foreach - cfg_opt.foreach = False - - optimizer = [] - has_dion_optimizer = is_dion_optimizer(cfg_opt) - for part in getattr(model, "parts", [model]): - trainable_params = list(filter(lambda x: x.requires_grad, part.parameters())) - assert len(trainable_params) > 0, "trainable_params cannot be empty" - # TODO(@akoumparouli): no branching for building the optimizer, refactor. 
- if has_dion_optimizer: - tmp_optimizer = build_dion_optimizer( - cfg_opt=cfg_opt, - model=part, - distributed_mesh=device_mesh, - ) - else: - tmp_optimizer = cfg_opt.instantiate(params=trainable_params) - if isinstance(distributed_config, MegatronFSDPConfig) and torch.distributed.get_world_size() > 1: - assert not has_dion_optimizer, "Dion optimizer does not support fully_shard_optimizer" - tmp_optimizer = fully_shard_optimizer(part, tmp_optimizer) - optimizer.append(tmp_optimizer) - - return optimizer - - -def build_checkpoint_config(cfg_ckpt, cache_dir, model_repo_id, is_peft) -> CheckpointingConfig: - """Build a checkpoint configuration. - - Args: - cfg_ckpt: Configuration for checkpointing. - cache_dir: Cache directory for the model. - model_repo_id: Model repository ID. - is_peft: Whether the model is PEFT. - state_dict_keys: Copy of the model state dict keys before any parallelization. - - Returns: - The instantiated checkpoint configuration. - """ - - ckpt_kwargs = dict( - enabled=True, - checkpoint_dir="checkpoints/", - model_save_format="safetensors", - model_repo_id=model_repo_id, - model_cache_dir=cache_dir if cache_dir is not None else hf_constants.HF_HUB_CACHE, - save_consolidated=True, - is_peft=is_peft, - ) - if cfg_ckpt is not None: - cfg_ckpt = cfg_ckpt.to_dict() - cfg_ckpt.pop("restore_from", None) - ckpt_kwargs |= cfg_ckpt - if ckpt_kwargs.get("is_peft", False) and ckpt_kwargs.get("model_save_format") == "torch_save": - raise ValueError( - "PEFT checkpointing is not supported for torch_save format. Save using `safetensors` format instead." - ) - checkpoint_config = CheckpointingConfig(**ckpt_kwargs) - return checkpoint_config - - -def build_loss_fn(cfg_loss): - """Build a loss function. - - Args: - cfg_loss (ConfigNode): Loss function configuration. - - Returns: - The instantiated loss function on the specified device. 
- """ - return cfg_loss.instantiate() - - -def compute_trust_remote_code_from_model(cfg_model): - """Compute the value of trust_remote_code based on the model configuration. - - Args: - cfg_model (ConfigNode): Model configuration. - - Returns: - bool: Whether to trust remote code. - """ - if hasattr(cfg_model, "trust_remote_code"): - return getattr(cfg_model, "trust_remote_code") - elif hasattr(cfg_model, "config") and hasattr(cfg_model.config, "trust_remote_code"): - return getattr(cfg_model.config, "trust_remote_code") - return resolve_trust_remote_code(_get_model_name(cfg_model)) - - -def _build_tokenizer(cfg_model, cfg_ds): - trust_remote_code = compute_trust_remote_code_from_model(cfg_model) - # if tokenizer is not provided, use the model config to instantiate it - if "tokenizer" not in cfg_ds and _get_model_name(cfg_model) is not None: - logging.info("Using model config to instantiate tokenizer") - tokenizer = NeMoAutoTokenizer.from_pretrained(_get_model_name(cfg_model), trust_remote_code=trust_remote_code) - elif cfg_ds.get("tokenizer", None) is None: - tokenizer = None - elif "_target_" not in cfg_ds.tokenizer: - tokenizer_dict = cfg_ds.tokenizer.to_dict() - trust_remote_code = tokenizer_dict.pop("trust_remote_code", trust_remote_code) - tokenizer = NeMoAutoTokenizer.from_pretrained(**tokenizer_dict, trust_remote_code=trust_remote_code) - else: - trust_remote_code = cfg_ds.tokenizer.to_dict().pop("trust_remote_code", trust_remote_code) - tokenizer = cfg_ds.tokenizer.instantiate(trust_remote_code=trust_remote_code) - - # Finally, check if the dataset target accepts a tokenizer parameter - kwargs = {} - if tokenizer is not None and callable(cfg_ds._target_): - try: - sig = inspect.signature(cfg_ds._target_) - if "tokenizer" in sig.parameters: - kwargs["tokenizer"] = tokenizer - except (ValueError, TypeError): - # If we can't get the signature, skip adding tokenizer - pass - return kwargs, tokenizer - - -def build_dataloader( - cfg_ds, - cfg_dl, - cfg_model, - 
cfg_ps, - seed, - local_batch_size, - global_batch_size, - max_steps, - val_check_interval, - dp_rank, - dp_world_size, - pp_enabled, - cp_size=1, - model: Optional[nn.Module] = None, -) -> tuple[DataLoader, PreTrainedTokenizerBase]: - """Build a DataLoader for the dataset. - - Args: - cfg_ds: Dataset configuration. - cfg_dl: DataLoader configuration. - cfg_model: Model configuration. - cfg_ps: Packed sequence configuration. - seed: Random seed. - local_batch_size: Local batch size. - global_batch_size: Global batch size. - max_steps: Maximum number of steps. - val_check_interval: Validation check interval. - dp_rank: Data parallel rank. - dp_world_size: Data parallel world size. - pp_enabled: Whether pipeline parallelism is enabled. - cp_size: Context parallel size. - model: Optional model instance. If provided and packed sequences are enabled, - seq_lens will only be included if the model's forward() accepts it. - Returns: - The instantiated DataLoader and tokenizer. - """ - with ScopedRNG(seed=seed, ranked=True): - kwargs, tokenizer = _build_tokenizer(cfg_model, cfg_ds) - # Megatron specific kwargs - if cfg_ds._target_ == MegatronPretraining: - kwargs["global_batch_size"] = global_batch_size - kwargs["trainer_max_steps"] = max_steps if max_steps is not None else None - kwargs["trainer_val_check_interval"] = val_check_interval - ds = cfg_ds.instantiate(**kwargs) - ds.build() - else: - with FirstRankPerNode(): - ds = cfg_ds.instantiate(**kwargs) - - # If using an IterableDataset, per-rank sharding for unique samples - if isinstance(ds, IterableDataset): - if callable(getattr(ds, "shard", None)): - ds = ds.shard(dp_world_size, dp_rank) - logging.info(f"Sharded IterableDataset via dataset.shard: world_size={dp_world_size}, rank={dp_rank}") - elif hasattr(ds, "dataset"): - # HuggingFace streaming datasets: split by file shards when possible. 
- from datasets.distributed import split_dataset_by_node - - assert hasattr(ds, "dataset"), "dataset must have a dataset attribute" - ds.dataset = split_dataset_by_node(ds.dataset, world_size=dp_world_size, rank=dp_rank) - logging.info(f"Sharded dataset via split_dataset_by_node: world_size={dp_world_size}") - else: - logging.warning("IterableDataset does not support sharding; Data may be duplicated across ranks.") - - packed_sequence_size = getattr(cfg_ps, "packed_sequence_size", 0) - packing_strategy = getattr(cfg_ps, "packing_strategy", "thd") - - # check if packed sequence is supported (only for thd strategy) - supports_seq_lens = _supports_seq_lens(model) - if packed_sequence_size > 0 and packing_strategy == "thd" and not supports_seq_lens: - logging.warning("Packed sequence is not supported without seq_lens; disabling packed sequence") - packed_sequence_size = 0 - - # Apply packing if configured - if packed_sequence_size > 0: - logger.info(f"Packing dataset with size: {packed_sequence_size}, strategy: {packing_strategy}") - if hasattr(ds, "shuffle"): - ds = ds.shuffle(seed) - - if packing_strategy == "neat": - from nemo_automodel.components.datasets.llm.neat_packing import neat_pack_dataset - from nemo_automodel.components.datasets.utils import neat_packed_collater - from nemo_automodel.components.models.common.packing import configure_packing, get_attn_implementation - - ds = neat_pack_dataset( - ds, - split=cfg_ds.split, - pack_size=packed_sequence_size, - max_packs=getattr(cfg_ps, "max_packs", None), - padding_idx=getattr(tokenizer, "pad_token_id", 0), - drop_long_samples=getattr(cfg_ps, "drop_long_samples", False), - ) - _attn_impl = get_attn_implementation(cfg_model) - configure_packing(attn_implementation=_attn_impl) - # Set collater with attn_implementation so it produces the right mask format - cfg_dl.collate_fn = lambda batch, _ai=_attn_impl: neat_packed_collater(batch, attn_implementation=_ai) - logger.info(f"Configured neat packing for 
attn_implementation={_attn_impl}") - else: - # "thd" — existing packing logic - ds = pack_dataset( - ds, - split=cfg_ds.split, - packed_sequence_size=packed_sequence_size, - max_packs=getattr(cfg_ps, "max_packs", None), - padding_idx=getattr(tokenizer, "pad_token_id", 0), - cp_size=cp_size, - ) - - if isinstance(ds, MegatronPretraining): - ds = ds.get_dataset(split=cfg_ds.splits_to_build) - dataloader_type = cfg_dl.get("dataloader_type", "single") - if "dataloader_type" in cfg_dl: - del cfg_dl.dataloader_type - batch_sampler = create_megatron_sampler( - dataset_len=len(ds), - micro_batch_size=local_batch_size, - global_batch_size=global_batch_size, - dataloader_type=dataloader_type, - rank=dp_rank, - world_size=dp_world_size, - ) - dl_kwargs = {"batch_sampler": batch_sampler} - elif not isinstance(ds, IterableDataset): - shuffle = cfg_dl.get("shuffle", True) - if "shuffle" in cfg_dl: - del cfg_dl.shuffle - - group_by_length = cfg_dl.get("group_by_length", False) - if "group_by_length" in cfg_dl: - del cfg_dl.group_by_length - - if group_by_length: - from nemo_automodel.components.datasets.llm.length_grouped_sampler import ( - LengthGroupedSampler as LLMLengthGroupedSampler, - ) - - sampler = LLMLengthGroupedSampler( - dataset=ds, - batch_size=local_batch_size, - seed=seed, - num_replicas=dp_world_size, - rank=dp_rank, - ) - else: - dist_sampler_kwargs = { - "num_replicas": dp_world_size, - "rank": dp_rank, - "shuffle": shuffle, - } - sampler = StatefulDistributedSampler( - ds, - seed=seed, - drop_last=True, - **dist_sampler_kwargs, - ) - dl_kwargs = {"sampler": sampler, "batch_size": local_batch_size} - if pp_enabled: - dl_kwargs["drop_last"] = True - else: - logging.info("Using IterableDataset; skipping sampler.") - # Optional shuffle for streaming IterableDataset (uses HF dataset shuffle if available) - shuffle = cfg_dl.get("shuffle", False) - shuffle_buffer_size = cfg_dl.get("shuffle_buffer_size", 10000) - # Do not pass shuffle-related kwargs to the DataLoader 
when using IterableDataset - # But leave them in dl config to be consistent - if hasattr(cfg_dl, "shuffle"): - del cfg_dl.shuffle - if hasattr(cfg_dl, "shuffle_buffer_size"): - del cfg_dl.shuffle_buffer_size - - if shuffle and hasattr(ds, "shuffle"): - try: - ds = ds.shuffle(buffer_size=shuffle_buffer_size, seed=seed) - logging.info(f"Shuffling IterableDataset with buffer_size={shuffle_buffer_size}, seed={seed}") - except Exception as e: - logging.warning(f"IterableDataset shuffle skipped due to error: {e}") - dl_kwargs = {} - - # Handle collate_fn with optional mask precomputation for pipeline parallelism - dl_kwargs = dl_kwargs | {"dataset": ds} - - # Handle collate_fn instantiation if it's a ConfigNode - if hasattr(cfg_dl, "collate_fn"): - if hasattr(cfg_dl.collate_fn, "_target_"): - collate_cfg = cfg_dl.collate_fn - dl_kwargs["collate_fn"] = lambda batch: collate_cfg.instantiate(batch=batch) - else: - dl_kwargs["collate_fn"] = cfg_dl.collate_fn - assert callable(dl_kwargs["collate_fn"]), "collate_fn must be callable" - - # Chain with mask precomputation if PP is enabled - if pp_enabled: - from nemo_automodel.components.datasets.utils import add_causal_masks_to_batch - - try: - hf_model_config = AutoConfig.from_pretrained( - _get_model_name(cfg_model), trust_remote_code=compute_trust_remote_code_from_model(cfg_model) - ) - except Exception: - logger.warning( - "Failed to load model config for causal mask precomputation. " - "Pipeline parallel mask precomputation will be skipped." - ) - else: - if "collate_fn" in dl_kwargs: - # Case 1: PP enabled + collate_fn exists -> chain them - # base_collate_fn -> add_causal_masks_to_batch - base_collate_fn = dl_kwargs["collate_fn"] - - def chained_collate_fn(batch, base_fn=base_collate_fn, config=hf_model_config): - batch = base_fn(batch) # Apply base collate (padding, batching, etc.) 
- batch = add_causal_masks_to_batch(batch, model_config=config) # Add masks - return batch - - dl_kwargs["collate_fn"] = chained_collate_fn - else: - # Case 2: PP enabled + no collate_fn -> only add masks - dl_kwargs["collate_fn"] = lambda batch, config=hf_model_config: add_causal_masks_to_batch( - batch, model_config=config - ) - - try: - import torch.multiprocessing as mp - - if mp.get_start_method(allow_none=True) is None: - mp.set_start_method("spawn", force=True) - except RuntimeError: - pass - return cfg_dl.instantiate(**dl_kwargs), tokenizer - - -def build_distributed(cfg_dist: Dict[str, Any]) -> "DistInfo": # noqa: F821 - """Build and initialize distributed training resources. - - Args: - cfg_dist: Configuration for distributed training. - - Returns: - Distributed training information from initialize_distributed. - """ - backend = cfg_dist.get("backend", "nccl") - timeout = cfg_dist.get("timeout_minutes", 1) - return initialize_distributed(backend=backend, timeout_minutes=timeout) - - -def build_step_scheduler(cfg, dataloader, dp_group_size, local_batch_size): - """Build the step scheduler. - - Args: - cfg: configuration for the StepScheduler class. - dataloader: the training dataloader, used for extracting the epoch_len (in batches). - dp_group_size: the size of the data parallel group. - micro_batch_size: the size of the micro batch. - - Returns: - StepScheduler: the configured StepScheduler. - """ - assert "_target_" not in cfg, "_target_ not permitted in step scheduler" - default_kwargs = dict( - num_epochs=10, - global_batch_size=32, - local_batch_size=local_batch_size, - dp_size=dp_group_size, - ckpt_every_steps=100, - dataloader=dataloader, - ) - if cfg is not None: - default_kwargs |= cfg.to_dict() - return StepScheduler(**default_kwargs) - - -def build_lr_scheduler(cfg, optimizer, step_scheduler) -> list[OptimizerParamScheduler] | None: # noqa: F821 - """Build the learning rate scheduler. 
- - Args: - cfg: Configuration for the OptimizerParamScheduler. - optimizer: The optimizer to be scheduled. - step_scheduler: The step scheduler to extract training parameters. - - Returns: - OptimizerParamScheduler: The configured learning rate scheduler, or None if not configured. - """ - if cfg is None: - return None - - # Calculate total steps for the training run - total_epochs = step_scheduler.num_epochs - epoch_len = len(step_scheduler.dataloader) - grad_acc_steps = step_scheduler.grad_acc_steps - - # Total optimizer steps (accounting for gradient accumulation) - total_steps = (total_epochs * epoch_len) // grad_acc_steps - if step_scheduler.max_steps is not None: - total_steps = min(total_steps, step_scheduler.max_steps) - - # Set defaults for scheduler parameters - optimizer_param_schedulers = [] - user_kwargs = cfg.to_dict() - default_kwargs = dict( - lr_warmup_steps=min(1000, total_steps // 10), # 10% warmup or max 1000 steps - lr_decay_steps=total_steps, - lr_decay_style="cosine", - wd_incr_steps=total_steps, - wd_incr_style="constant", - ) - - if not isinstance(optimizer, list): - optimizer = [optimizer] - - for opt in optimizer: - base_lr = opt.param_groups[0]["lr"] - default_kwargs.update( - dict( - optimizer=opt, - init_lr=base_lr * 0.1, # Start warmup at 10% of base LR - max_lr=base_lr, - min_lr=base_lr * 0.01, # End at 1% of base LR - start_wd=opt.param_groups[0].get("weight_decay", 0.0), - end_wd=opt.param_groups[0].get("weight_decay", 0.0), - ) - ) - default_kwargs.update(user_kwargs) - optimizer_param_schedulers.append(OptimizerParamScheduler(**default_kwargs)) - - logger.info( - f"Building LR scheduler with total_steps={total_steps}, " - f"warmup_steps={default_kwargs['lr_warmup_steps']}, " - f"decay_style={default_kwargs['lr_decay_style']}" - ) - - return optimizer_param_schedulers - - -def build_wandb(cfg) -> wandb.Run: - """Instantiates wandb and returns the instance. If no name is given, it will use the model name. 
- - Args: - cfg: Configuration for wandb. - - Returns: - The wandb instance. - """ - assert cfg.get("wandb", None) is not None - kwargs = cfg.wandb.to_dict() - if kwargs.get("name", "") == "": - kwargs["name"] = "_".join(_get_model_name(cfg.model).split("/")[-2:]) - run = wandb.init( - **kwargs, - config=cfg.to_dict(), - settings=Settings(silent=True), - ) - return run - - -def calculate_loss(loss_fn, **kwargs) -> torch.Tensor: - """Calculate the loss. - - Args: - loss_fn: Loss function. - **kwargs: Keyword arguments for the loss function. - - Returns: - The loss. - """ - loss_fn_kwargs = {"num_label_tokens": kwargs.pop("num_label_tokens", None)} - if isinstance(loss_fn, FusedLinearCrossEntropy): - model = kwargs.pop("model") - labels = kwargs.pop("labels") - - # find the lm_head in the model - lm_head = None - if hasattr(model, "get_output_embeddings"): - lm_head = model.get_output_embeddings().weight - else: - for n, p in model.named_parameters(remove_duplicate=False): - if "lm_head" in n and n.endswith(".weight"): - lm_head = p - break - if lm_head is None: - raise ValueError("lm_head.weight not found in model") - - # unshard the possibly sharded lm_head - lm_head = lm_head.full_tensor() if hasattr(lm_head, "full_tensor") else lm_head - loss_fn_kwargs.update( - { - "hidden_states": kwargs.pop("hidden_states"), - "labels": labels, - "lm_weight": lm_head, - } - ) - else: - loss_fn_kwargs.update( - { - "logits": kwargs.pop("logits"), - "labels": kwargs.pop("labels"), - } - ) - - return loss_fn(**loss_fn_kwargs) - - -def build_validation_dataloader(cfg, dp_world_size, dp_rank, pp_enabled, model: Optional[nn.Module] = None): - def _prepare_val_ds_name(val_ds_name): - val_ds_name = val_ds_name.replace("validation_dataset", "") - if len(val_ds_name) > 1 and val_ds_name[0] in ("_", "-", "."): - val_ds_name = val_ds_name[1:] - if val_ds_name == "": - val_ds_name = "default" - return val_ds_name - - # Build validation dataloader if the config provides it - val_dataloaders 
= {} - for val_ds_name in filter(lambda x: x.startswith("validation_dataset"), cfg.to_dict().keys()): - val_ds_cfg = cfg.get(val_ds_name, None) - val_ds_name = _prepare_val_ds_name(val_ds_name) - val_dataloaders[val_ds_name] = build_dataloader( - val_ds_cfg, - cfg.validation_dataloader, - cfg.model, - cfg_ps=cfg.get("packed_sequence", None) - if _uses_te_dot_product_attention(cfg.model) and _uses_thd_collater(cfg.dataloader) - else None, - seed=cfg.get("seed", 42), - local_batch_size=cfg.get("step_scheduler.local_batch_size", 1), - global_batch_size=cfg.get("step_scheduler.global_batch_size", 1), - max_steps=cfg.get("step_scheduler.max_steps", None), - val_check_interval=cfg.get("step_scheduler.val_every_steps", None), - dp_rank=dp_rank, - dp_world_size=dp_world_size, - pp_enabled=pp_enabled, - cp_size=cfg.get("distributed.cp_size", 1), - model=model, - )[0] - - return val_dataloaders - - -# --------------------------------------------------------------------------- -# Trainer class – orchestration only -# --------------------------------------------------------------------------- - - -class TrainFinetuneRecipeForNextTokenPrediction(BaseRecipe): - """Recipe for fine-tuning a model for next-token prediction. - - This class orchestrates training, from setup to main training loop. - """ - - def __init__(self, cfg): - """Initialize the recipe with configuration. - - Args: - cfg: Configuration dictionary/object for training. - """ - self.cfg = cfg - - # ------------------ build phase ------------------ - def setup(self): - """Builds all components needed for training/validation/logging/checkpointing/etc. - - This is the last place where self.cfg should be referenced. - - Raises: - NotImplemented: Raises if it tries to restore a checkpoint; will be removed. 
- """ - torch.cuda.reset_peak_memory_stats() - self.dist_env = build_distributed(self.cfg.get("dist_env", {})) - # setups logging and adds the rankfilter to logging - setup_logging() - - apply_cache_compatibility_patches() - apply_te_patches() - # Set up the stateful random number generator - self.rng = StatefulRNG(seed=self.cfg.get("seed", 42), ranked=True) - # Enable NVTX patching only when explicitly requested in config - self.enable_nvtx = bool(self.cfg.get("nvtx", False)) - - self.dist_setup = setup_distributed(self.cfg, world_size=self.dist_env.world_size) - self.distributed_config = self.dist_setup.strategy_config - self.device_mesh = self.dist_setup.device_mesh - self.moe_mesh = self.dist_setup.moe_mesh - self.pp_enabled = self.dist_setup.pp_enabled - self.pipeline_config = self.dist_setup.pipeline_config - - if self.dist_env.is_main and hasattr(self.cfg, "wandb"): - suppress_wandb_log_messages() - run = build_wandb(self.cfg) - logging.info("🚀 View run at {}".format(run.url)) - - self.mlflow_logger = None - if self.dist_env.is_main and hasattr(self.cfg, "mlflow"): - self.mlflow_logger = build_mlflow(self.cfg) - self.mlflow_logger.log_params(self.cfg.to_dict()) - logging.info("MLflow experiment tracking enabled") - - self.comet_logger = None - if self.dist_env.is_main and hasattr(self.cfg, "comet"): - self.comet_logger = build_comet(self.cfg) - self.comet_logger.log_params(self.cfg.to_dict()) - logging.info("Comet experiment tracking enabled") - - # Log experiment details on main rank - self._log_experiment_details() - self._log_library_versions() - - # Build loss_fn (will be set on pipeline_config if PP enabled) - self.loss_fn = build_loss_fn(self.cfg.loss_fn) - - # Pipeline runtime fields: override pp_batch_size and pp_microbatch_size - if self.pp_enabled: - pp_batch_size = self.cfg.step_scheduler.local_batch_size - pp_microbatch_size = self.cfg.get("distributed.pipeline.pp_microbatch_size", 1) - - assert pp_batch_size // pp_microbatch_size >= 
self.dist_setup.pp_size, ( - f"pp_batch_size {pp_batch_size} // pp_microbatch_size {pp_microbatch_size} must be >= pp_size {self.dist_setup.pp_size}" - ) - - # THD override logic - if ( - self.dist_setup.cp_size > 1 - and _uses_te_dot_product_attention(self.cfg.model) - and _uses_thd_collater(self.cfg.dataloader) - ): - pp_microbatch_size = 1 - pp_batch_size = pp_batch_size // self.cfg.get("distributed.pipeline.pp_microbatch_size", 1) - logging.info( - f"Overriding pp_batch_size: {pp_batch_size}, pp_microbatch_size: {pp_microbatch_size} for THD" - ) - - assert not isinstance(self.distributed_config, MegatronFSDPConfig), ( - "MegatronFSDPConfig is not supported when pipeline parallelism is enabled" - ) - - # Update pipeline_config runtime fields - self.pipeline_config.pp_batch_size = pp_batch_size - self.pipeline_config.pp_microbatch_size = pp_microbatch_size - self.pipeline_config.patch_stage_backward_maybe_with_nosync = self.cfg.get( - "model.backend.enable_fsdp_optimizations", False - ) - self.pipeline_config.loss_fn = self.loss_fn - - # Infer pp_seq_len from dataset config if not explicitly set - if hasattr(self.pipeline_config, "pp_seq_len") and self.pipeline_config.pp_seq_len is None: - packed_seq_size = self.cfg.get("packed_sequence.packed_sequence_size", 0) - if packed_seq_size > 0: - self.pipeline_config.pp_seq_len = packed_seq_size - elif self.cfg.get("dataset.seq_len", None) is not None: - self.pipeline_config.pp_seq_len = self.cfg.dataset.seq_len - - # Build components - self.peft_config = None - if self.cfg.get("peft", None) is not None: - self.peft_config = self.cfg.peft.instantiate() - - # Build checkpoint config - checkpoint_config = build_checkpoint_config( - self.cfg.get("checkpoint", None), - self.cfg.get("model.cache_dir", None), - _get_model_name(self.cfg.model), - True if self.cfg.get("peft", None) else False, - ) - - if self.cfg.get("clip_grad_norm.max_norm", None) is not None: - self.max_grad_norm = float(self.cfg.clip_grad_norm.max_norm) - 
else: - logging.info("No clip_grad_norm.max_norm specified in config, using default value of 1.0") - self.max_grad_norm = 1.0 - - # Create Checkpointer instance - self.checkpointer = Checkpointer( - config=checkpoint_config, - dp_rank=self._get_dp_rank(include_cp=True), - tp_rank=self._get_tp_rank(), - pp_rank=self._get_pp_rank(), - moe_mesh=self.moe_mesh, - ) - - # Disable fused RoPE when context parallelism is enabled (cp > 1) - if self.dist_setup.cp_size > 1 and self.cfg.get("model.backend.rope_fusion", False): - logging.info("Disabling rope_fusion because cp_size=%d > 1", self.dist_setup.cp_size) - self.cfg.model.backend.rope_fusion = False - - model = build_model( - self.cfg.model, - self.peft_config, - has_packed_sequence=self.cfg.get("packed_sequence.packed_sequence_size", 0) > 0, - seed=self.cfg.get("seed", 42), - cfg_fp8=self.cfg.get("fp8", None), - cfg_compile=self.cfg.get("compile", None), - cfg_quantization=self.cfg.get("quantization", None), - device_mesh=self.device_mesh, - moe_mesh=self.moe_mesh, - distributed_config=self.distributed_config, - pipeline_config=self.pipeline_config, - cfg_qat=self.cfg.get("qat", None), - cfg_moe=self.dist_setup.moe_config, - activation_checkpointing=self.dist_setup.activation_checkpointing, - sdpa_method=self.cfg.get("sdpa_method", None), - ) - self.optimizer = build_optimizer(model, self.cfg.optimizer, self.distributed_config, self.device_mesh) - - if not _supports_logits_to_keep(model) and not isinstance(self.loss_fn, MaskedCrossEntropy): - logger.warning("logits_to_keep not found in model.forward. 
Using MaskedCrossEntropy instead.") - self.loss_fn = MaskedCrossEntropy() - - if isinstance(model, AutoPipeline): - self.model_parts = model.parts - self.pp = model - if self.enable_nvtx: - import nemo_automodel.autonvtx as autonvtx - - # Patch each pipeline stage with NVTX profiling - for i, part in enumerate(self.model_parts): - autonvtx.patch(part, name=f"PipelineStage_{i}") - else: - if self.enable_nvtx: - import nemo_automodel.autonvtx as autonvtx - - # Patch model with NVTX profiling - autonvtx.patch(model, name=model.__class__.__name__) - self.model_parts = [model] - self.pp = None - - # Extract TE FP8 config from model backend (set after model construction) - self.te_fp8 = self.model_parts[0].backend.te_fp8 if hasattr(self.model_parts[0], "backend") else None - - _packed_seq_size = self.cfg.get("packed_sequence.packed_sequence_size", 0) - if self.dist_setup.cp_size > 1 and _packed_seq_size > 0: - _m = self.model_parts[0] - if hasattr(_m, "supports") and not _m.supports_cp_with_sequence_packing: - raise ValueError( - f"Context parallelism (cp_size={self.dist_setup.cp_size}) with packed sequences " - f"is not supported for {type(_m).__name__}.\n" - f"Either disable sequence packing:\n" - f" packed_sequence:\n" - f" packed_sequence_size: 0\n" - f"or switch to the TE attention backend -- MoE models only:\n" - f" model:\n" - f" backend:\n" - f" attn: te" - ) - - self.dataloader, self.tokenizer = build_dataloader( - self.cfg.dataset, - self.cfg.dataloader, - self.cfg.model, - self.cfg.get("packed_sequence", None), - seed=self.cfg.get("seed", 42), - local_batch_size=self.cfg.get("step_scheduler.local_batch_size", 1), - global_batch_size=self.cfg.get("step_scheduler.global_batch_size", 1), - max_steps=self.cfg.get("step_scheduler.max_steps", None), - val_check_interval=self.cfg.get("step_scheduler.val_every_steps", None), - dp_rank=self._get_dp_rank(), - dp_world_size=self._get_dp_group_size(), - pp_enabled=self.pp_enabled, - 
cp_size=self.cfg.get("distributed.cp_size", 1), - model=self.model_parts[0], - ) - self.val_dataloaders = build_validation_dataloader( - self.cfg, - self._get_dp_group_size(), - self._get_dp_rank(), - self.pp_enabled, - model=self.model_parts[0], - ) - self.best_metric_key = self.cfg.get("checkpoint.best_metric_key", "default") - # Scheduler - self.step_scheduler = build_step_scheduler( - self.cfg.get("step_scheduler", None), - self.dataloader, - self._get_dp_group_size(), - local_batch_size=self.cfg.get("step_scheduler.local_batch_size", 1), - ) - self._setup_garbage_collection(self.step_scheduler) - - # Build learning rate scheduler - self.lr_scheduler = build_lr_scheduler(self.cfg.get("lr_scheduler", None), self.optimizer, self.step_scheduler) - - # Log model, parameter counts, norms, optimizer and scheduler - self._log_model_and_optimizer_details(self.model_parts, self.optimizer, self.lr_scheduler) - - # Handle delayed fake-quant toggling for QAT if configured - self._qat_disable_fn, self._qat_enable_fn, self._qat_enable_after = self._setup_qat(self.cfg, self.model_parts) - - # Enable MoE load balance tracking if configured - moe_metrics_cfg = self.cfg.get("moe_metrics", None) - if moe_metrics_cfg and moe_metrics_cfg.get("enabled", False): - from nemo_automodel.components.moe.load_balance_metrics import enable_load_balance_tracking - - for mp in self.model_parts: - enable_load_balance_tracking(mp) - - self.mfu_calculator = AutoMFU.from_config(self.model_parts[0]) - - # NEFTune: noisy embeddings for improved instruction fine-tuning - neftune_cfg = self.cfg.get("neftune", None) - self.neftune = None - if neftune_cfg is not None: - from nemo_automodel.components.training.neftune import NEFTune - - noise_alpha = neftune_cfg.get("noise_alpha", 5.0) if hasattr(neftune_cfg, "get") else neftune_cfg - self.neftune = NEFTune(noise_alpha=float(noise_alpha)) - self.neftune.activate(self.model_parts[0]) - - restore_from = self.cfg.get("checkpoint.restore_from", None) - # 
Initialize JSONL loggers - self.metric_logger_train = build_metric_logger( - pathlib.Path(self.checkpointer.config.checkpoint_dir) / "training.jsonl" - ) - self.metric_logger_valid = { - name: build_metric_logger( - pathlib.Path(self.checkpointer.config.checkpoint_dir) - / (f"validation_{name}.jsonl" if name != "default" else "validation.jsonl") - ) - for name in self.val_dataloaders.keys() - } - - # Optionally resume - self.load_checkpoint(restore_from) - torch.cuda.empty_cache() - - # Log step scheduler details - self._log_step_scheduler_details(self.step_scheduler) - - def _collect_moe_load_balance(self): - """Collect MoE load balance metrics with DP all-reduce. - - Must be called on ALL ranks (the all-reduce is collective). - Stores the result in ``self._moe_layer_loads`` for rank-0 logging. - """ - moe_metrics_cfg = self.cfg.get("moe_metrics", None) - if not (moe_metrics_cfg and moe_metrics_cfg.get("enabled", False)): - self._moe_layer_loads = None - return - - from nemo_automodel.components.moe.load_balance_metrics import collect_expert_loads - - dp_group = self._get_dp_group(include_cp=True) - all_loads: dict = {} - for mp in self.model_parts: - all_loads.update(collect_expert_loads(mp, dp_group=dp_group)) - self._moe_layer_loads = all_loads if all_loads else None - - def _log_moe_metrics(self, step: int, wandb_log_fn) -> None: - """Log MoE load balance metrics to wandb. - - Call after :meth:`_collect_moe_load_balance`. Only logs when - ``_moe_layer_loads`` is populated and a wandb log function is provided. - - Args: - step: Current training/benchmark step for wandb x-axis. - wandb_log_fn: Callable like ``wandb.log`` or ``wandb_run.log``. 
- """ - if not getattr(self, "_moe_layer_loads", None): - return - - from nemo_automodel.components.moe.load_balance_metrics import ( - compute_brief_metrics, - compute_detailed_metrics, - ) - - moe_metrics_cfg = self.cfg.get("moe_metrics", None) - mode = moe_metrics_cfg.get("mode", "brief") if moe_metrics_cfg else "brief" - top_k = moe_metrics_cfg.get("top_k_experts", 0) if moe_metrics_cfg else 0 - if mode == "detailed": - detailed_every = moe_metrics_cfg.get("detailed_every_steps", None) if moe_metrics_cfg else None - if detailed_every is None or step % detailed_every == 0: - wandb_log_fn(compute_detailed_metrics(self._moe_layer_loads, top_k=top_k), step=step) - else: - wandb_log_fn(compute_brief_metrics(self._moe_layer_loads, top_k=top_k), step=step) - else: - wandb_log_fn(compute_brief_metrics(self._moe_layer_loads, top_k=top_k), step=step) - - def _setup_qat(self, cfg, model_parts: list[nn.Module]): - if not cfg.get("qat.enabled", False): - return None, None, None - from nemo_automodel.components.quantization.qat import ( - get_disable_fake_quant_fn, - get_enable_fake_quant_fn, - ) - - qat_cfg = cfg.qat - _qat_enable_after = qat_cfg.get("fake_quant_after_n_steps", 0) - # Collect mode from any model part that has it - qat_mode = getattr(model_parts[0], "_qat_mode", None) - - if qat_mode is None: - return None, None, None - - _qat_disable_fn = get_disable_fake_quant_fn(qat_mode) - _qat_enable_fn = get_enable_fake_quant_fn(qat_mode) - if _qat_disable_fn is not None and _qat_enable_after is not None: - try: - # start with fake-quant disabled, will enable later - for part in model_parts: - _qat_disable_fn(part) - logger.info("QAT fake-quant disabled initially; will enable after %s steps", _qat_enable_after) - except Exception as e: - logger.warning("Failed to disable fake-quant at setup: %s", e) - return _qat_disable_fn, _qat_enable_fn, _qat_enable_after - - def _enable_qat_if_delayed(self, step: int): - if getattr(self, "_qat_enable_after", None) is None: - return 
- if step < self._qat_enable_after or self._qat_enable_fn is None: - return - try: - for mp in self.model_parts: - self._qat_enable_fn(mp) - logger.info("Enabled QAT fake-quant after step %s", step) - # Enable one - self._qat_enable_after = None - except Exception as e: - logger.warning("Failed to enable fake-quant: %s", e) - - # ------------------ main loop ------------------ - def run_train_validation_loop(self): - """Run the training loop over all epochs and batches. - - For each batch, perform a forward pass, compute loss, backpropagate, - and update model parameters when necessary. Also prints loss every gradient step. - """ - for mp in self.model_parts: - mp.train() - self.timestamp = time.perf_counter() - - for epoch in self.step_scheduler.epochs: - self.step_scheduler.set_epoch(epoch) - # The step scheduler yields a list of batches with the following properties: - # 1. len(batches) == grad_acc_steps - # 2. len(batches[0]) == batch_size - for batches in self.step_scheduler: - # If QAT delayed fake-quant is configured, enable after threshold - self._enable_qat_if_delayed(self.step_scheduler.step) - train_log_data = self._run_train_optim_step(batches, self.max_grad_norm) - # Collect MoE load balance metrics (all ranks participate in all-reduce) - self._collect_moe_load_balance() - # log - self.log_train_metrics(train_log_data) - - # Run validation every val_every_steps - val_losses = {} - if self.step_scheduler.is_val_step: - for val_name, val_dataloader in self.val_dataloaders.items(): - val_log_data = self._run_validation_epoch(val_dataloader) - val_losses[val_name] = val_log_data.metrics["val_loss"] - self.log_val_metrics(val_name, val_log_data, self.metric_logger_valid[val_name]) - for mp in self.model_parts: - mp.train() - - # Save the checkpoint every ckpt_every_steps - if self.step_scheduler.is_ckpt_step: - self.save_checkpoint( - epoch, - self.step_scheduler.step, - train_log_data.metrics["loss"], - val_losses, - best_metric_key=self.best_metric_key, - 
) - self._maybe_collect_garbage() - # Close JSONL loggers after training loop completes - self.metric_logger_train.close() - for v in self.metric_logger_valid.values(): - v.close() - - self.checkpointer.close() - - # ------------------ helpers ------------------ - def _forward_backward_step( - self, - idx, - batch, - *, - loss_buffer, - num_label_tokens, - num_batches, - is_train: bool = True, - ): - # Move batch to device (handle both tensors and dicts of tensors like causal_mask_mapping) - batch = { - k: ( - {dk: dv.to(self.dist_env.device, non_blocking=True) for dk, dv in v.items() if dv is not None} - if isinstance(v, dict) - else (v.to(self.dist_env.device, non_blocking=True) if isinstance(v, torch.Tensor) else v) - ) - for k, v in batch.items() - } - train_ctx, batch = make_cp_batch_and_ctx( - self.device_mesh, - batch, - use_te=_uses_te_dot_product_attention( - self.model_parts[0] if hasattr(self, "model_parts") else self.cfg.model - ) - and _uses_thd_collater(self.cfg.dataloader), - padding_token_id=self.tokenizer.pad_token_id if self.tokenizer else 0, - num_chunks=_get_num_thd_chunks(self.pp_enabled, self.cfg), - ) - labels = batch.pop("labels") - fp8_ctx = self.te_fp8.maybe_te_autocast() if self.te_fp8 is not None else nullcontext() - - if self.pp_enabled: - with train_ctx(), fp8_ctx: - losses = [] if self.pp.info.has_last_stage else None - if self.pp.info.has_last_stage: - masked_labels = labels.clone() - targets = masked_labels - else: - targets = None - - input_ids = batch.pop("input_ids") - - # Update PP stage shapes for the current batch's seq_len. - # This is a no-op when the length hasn't changed. 
- self.pp.update_seq_len(input_ids.shape[1]) - - # Filter out None values and empty dicts from batch to avoid PP chunking errors - batch_filtered = { - k: v for k, v in batch.items() if v is not None and not (isinstance(v, dict) and len(v) == 0) - } - - if is_train: - # Use step for training (forward + backward) - if self.pp.info.has_first_stage: - self.pp.info.schedule.step(input_ids, target=targets, losses=losses, **batch_filtered) - else: - self.pp.info.schedule.step(target=targets, losses=losses, **batch_filtered) - else: - # Use eval for validation (forward only, no backward) - if self.pp.info.has_first_stage: - self.pp.info.schedule.eval(input_ids, target=targets, losses=losses, **batch_filtered) - else: - self.pp.info.schedule.eval(target=targets, losses=losses, **batch_filtered) - - if self.pp.info.has_last_stage: - local_loss = torch.sum(torch.stack(losses)) - else: - local_loss = torch.tensor(0.0, device=self.dist_env.device) - - loss_buffer.append(local_loss.clone().detach()) - else: - model = self.model_parts[0] - sync_ctx = ( - get_sync_ctx( - model, - idx == num_batches - 1, - defer_fsdp_grad_sync=getattr(self.distributed_config, "defer_fsdp_grad_sync", True), - ) - if is_train - else nullcontext() - ) - with train_ctx(), sync_ctx, fp8_ctx: - batch = filter_forward_kwargs(model, batch) - if isinstance(self.loss_fn, FusedLinearCrossEntropy): - # use num_logits_to_keep to avoid full logits matrix in memory - out = model(logits_to_keep=1, **batch) - if "hidden_states" not in out: - raise ValueError( - "FusedLinearCrossEntropy requires the model to output hidden states. Set `model.output_hidden_states=True` in the config." 
- ) - else: - out = model(**batch) - - local_loss = calculate_loss( - self.loss_fn, - logits=getattr(out, "logits", out), - labels=labels, - model=model, - hidden_states=get_final_hidden_states(out), - num_label_tokens=num_label_tokens, - ) - loss_buffer.append(local_loss.clone().detach()) - if is_train: - (local_loss * self._get_dp_group_size(include_cp=True)).backward() - - def _run_train_optim_step(self, batches, max_grad_norm: Optional[float] = None): - """Execute a single training step. - - Args: - batches: List of batches of training data. - max_grad_norm: Gradient clipping norm. Optional, if None will not clip gradients. - """ - - num_label_tokens = torch.tensor( - sum((batch["labels"] != -100).sum().item() for batch in batches), dtype=torch.long - ) - num_label_tokens = self._dp_allreduce(num_label_tokens).item() - - # MoE aux loss gradients are injected via MoEAuxLossAutoScaler, which - # multiplies them by main_loss_backward_scale during backward. This - # counteracts the unwanted scaling that FSDP and PP post-hoc rescaling - # apply to *all* gradients (including aux loss): - # - # Non-PP: FSDP allreduce divides grads by dp_group_size. - # Scale = dp_group_size → net = 1. - # - # PP: FSDP divides by dp_group_size, then - # scale_grads_and_clip_grad_norm divides by - # (num_label_tokens / dp_group_size). The dp_group_size - # factors cancel, leaving net 1/num_label_tokens. - # Scale = num_label_tokens → net = 1. - if self.pp_enabled: - MoEAuxLossAutoScaler.main_loss_backward_scale = torch.tensor(float(num_label_tokens)) - else: - MoEAuxLossAutoScaler.main_loss_backward_scale = torch.tensor( - float(self._get_dp_group_size(include_cp=True)) - ) - - loss_buffer = [] - - # number of tokens in the batch, excluding any tail padding. 
- num_tokens_in_batch = torch.tensor( - sum(batch["labels"].numel() - count_tail_padding(batch["labels"]) for batch in batches), - dtype=torch.long, - ) - num_tokens_in_batch = self._dp_allreduce(num_tokens_in_batch).item() - - num_batches = len(batches) - prepare_for_grad_accumulation(self.model_parts, pp_enabled=self.pp_enabled) - - for i, batch in enumerate(batches): - if i == num_batches - 1: - prepare_for_final_backward(self.model_parts, pp_enabled=self.pp_enabled) - - self._forward_backward_step( - i, batch, loss_buffer=loss_buffer, num_label_tokens=num_label_tokens, num_batches=num_batches - ) - - if i == 0: - prepare_after_first_microbatch() - - grad_norm = scale_grads_and_clip_grad_norm( - max_grad_norm, - self.model_parts, - norm_type=2.0, - pp_enabled=self.pp_enabled, - device_mesh=self.device_mesh, - moe_mesh=self.moe_mesh, - ep_axis_name="ep" if self.moe_mesh is not None and "ep" in self.moe_mesh.mesh_dim_names else None, - pp_axis_name="pp" if self.pp_enabled else None, - foreach=True, - num_label_tokens=num_label_tokens, - dp_group_size=self._get_dp_group_size(include_cp=True), - ) - - # Note(MegatronFSDP): Need to call these functions for MegatronFSDP if not using latest api - # self.model_parts[0].finish_grad_sync() - - self.checkpointer.maybe_wait_for_staging() - for opt in self.optimizer: - opt.step() - opt.zero_grad() - - if hasattr(self.model_parts[0], "update_moe_gate_bias"): - for mp in self.model_parts: - mp.update_moe_gate_bias() - - if self.lr_scheduler is not None: - for scheduler in self.lr_scheduler: - scheduler.step(1) - - # Precompute FP8 scales - fp8_config = self.cfg.get("fp8", None) - if ( - fp8_config is not None - and fp8_config.get("enabled", False) - and fp8_config.get("precompute_float8_dynamic_scale_for_fsdp", False) - and not self.pp_enabled - and self.device_mesh is not None - and self.device_mesh["dp_shard"].size() > 1 - ): - precompute_float8_dynamic_scale_for_fsdp(self.model_parts[0]) - - # Note(MegatronFSDP): Need to 
call these functions for MegatronFSDP if not using latest api - # self.model_parts[0].install_optimized_model_weights() - # self.model_parts[0].zero_grad_buffer() - - t = time.perf_counter() - time_delta = t - self.timestamp - self.timestamp = t - tps = num_tokens_in_batch / time_delta - - mfu = None - mfu_calculator = getattr(self, "mfu_calculator", None) - if batches and mfu_calculator is not None: - step_flops = 0.0 - flops_supported = True - for batch in batches: - input_ids = batch.get("input_ids") - if input_ids is None: - flops_supported = False - break - batch_flops = mfu_calculator.get_flops(input_ids) - if batch_flops is None: - flops_supported = False - break - step_flops += float(batch_flops) - - if flops_supported: - step_flops = self._dp_allreduce( - torch.tensor(step_flops, dtype=torch.float64, device=self.dist_env.device), include_cp=True - ).item() - mfu = calculate_mfu(step_flops / 1e12, self.dist_env.world_size, time_delta) - - reporting_loss = torch.sum(torch.stack(loss_buffer)) - reporting_loss = self._dp_allreduce(reporting_loss, include_cp=True) - if self.pp_enabled: - reporting_loss = reporting_loss / num_label_tokens - reporting_loss = reporting_loss.to(self.dist_env.device) - # Send loss to first rank if pp group rank is 0 - src_rank = self.device_mesh.mesh.reshape(-1)[-1].item() - if self.dist_env.rank == src_rank: - torch.distributed.send(reporting_loss, dst=0) - elif self.dist_env.is_main: - torch.distributed.recv(reporting_loss, src=src_rank) - - reporting_loss = reporting_loss.cpu().item() - # fix reporting_loss, tps across ranks - - return MetricsSample( - step=self.step_scheduler.step, - epoch=self.step_scheduler.epoch, - metrics={ - "loss": reporting_loss, - "grad_norm": grad_norm, - "lr": self.optimizer[0].param_groups[0]["lr"], - "mem": torch.cuda.max_memory_allocated() / 1024**3, - "tps": tps, - "tps_per_gpu": tps / self._get_cp_group_size() / max(self._get_dp_group_size(), 1), - "mfu": mfu, - "num_tokens_per_step": 
num_tokens_in_batch, - "num_label_tokens": num_label_tokens, - }, - ) - - @torch.no_grad() - def _run_validation_epoch(self, val_dataloader): - """Run one pass over a single validation dataloader. - - Args: - val_name: Name of the validation dataset. - val_dataloader: DataLoader for the validation dataset. - """ - with ScopedRNG(seed=1, ranked=True): - for mp in self.model_parts: - mp.eval() - - total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.dist_env.device) - total_num_label_tokens = 0 - - for batch in val_dataloader: - loss_buffer = [] - num_label_tokens = (batch["labels"] != -100).sum().item() - self._forward_backward_step( - 0, - batch, - loss_buffer=loss_buffer, - num_label_tokens=None, # we will normalize outside. - num_batches=1, - is_train=False, - ) - - total_loss += torch.sum(torch.stack(loss_buffer)).item() - total_num_label_tokens += num_label_tokens - - total_loss = self._dp_allreduce(total_loss, include_cp=True) - total_num_label_tokens = self._dp_allreduce( - torch.tensor(total_num_label_tokens, dtype=torch.long, device=self.dist_env.device) - ).item() - val_loss = total_loss / max(total_num_label_tokens, 1e-8) - - # For PP, send val_loss and num_label_tokens from last stage to main rank - if self.pp_enabled: - val_loss = val_loss.to(self.dist_env.device) - # On non-last ranks total_num_label_tokens is 0; this tensor is just a recv buffer. 
- pp_num_tokens = torch.tensor(total_num_label_tokens, dtype=torch.long, device=self.dist_env.device) - src_rank = self.device_mesh.mesh.reshape(-1)[-1].item() - if self.dist_env.rank == src_rank: - torch.distributed.send(val_loss, dst=0) - torch.distributed.send(pp_num_tokens, dst=0) - elif self.dist_env.is_main: - torch.distributed.recv(val_loss, src=src_rank) - torch.distributed.recv(pp_num_tokens, src=src_rank) - total_num_label_tokens = pp_num_tokens.item() - - val_loss = val_loss.item() if isinstance(val_loss, torch.Tensor) else val_loss - - return MetricsSample( - step=self.step_scheduler.step, - epoch=self.step_scheduler.epoch, - metrics={ - "val_loss": val_loss, - "lr": self.optimizer[0].param_groups[0]["lr"], - "num_label_tokens": total_num_label_tokens, - "mem": torch.cuda.max_memory_allocated() / 1024**3, - }, - ) - - def log_val_metrics(self, val_name, log_data, metric_logger=None): - """Log metrics to wandb, MLflow and other loggers - Args: - log_data: MetricsSample object, containing: - step: int, the current step. - epoch: int, the current epoch. - metrics: Dict[str, float], containing: - "val_loss": Validation loss. - "lr": Learning rate. - "num_label_tokens": Number of label tokens. - "mem": Memory allocated. 
- """ - - if not self.dist_env.is_main or log_data is None: - return - - if wandb.run is not None: - wandb.log(log_data.to_dict() | {"val_name": val_name}, step=log_data.step) - - if self.mlflow_logger is not None: - self.mlflow_logger.log_metrics(log_data.to_dict(), step=log_data.step) - - if self.comet_logger is not None: - self.comet_logger.log_metrics(log_data.to_dict() | {"val_name": val_name}, step=log_data.step) - - # JSONL validation log - if not metric_logger is None: - metric_logger.log(log_data) - - logging.info( - '[val] name "{}" | step {} | epoch {} | loss {:.4f} | lr {:.2e} | num_label_tokens {}'.format( - val_name, - log_data.step, - log_data.epoch, - log_data.metrics["val_loss"], - log_data.metrics["lr"], - log_data.metrics["num_label_tokens"], - ) - ) - - def log_train_metrics(self, log_data): - """Log metrics to wandb and other loggers. - - Args: - log_data: MetricsSample object, containing: - step: int, the current step. - epoch: int, the current epoch. - metrics: Dict[str, float], containing: - "loss": Training loss. - "grad_norm": Grad norm from the training step. - "lr": Learning rate. - "mem": Memory allocated. - "tps": Tokens per second. - "tps_per_gpu": Tokens per second per GPU. - "num_label_tokens": Number of label tokens. 
- """ - if not self.dist_env.is_main: - return - - # Log to remote services (WandB, MLflow, Comet) according to step_scheduler frequency - if self.step_scheduler.is_remote_logging_step: - if wandb.run is not None: - wandb.log(log_data.to_dict(), step=self.step_scheduler.step) - if self.mlflow_logger is not None: - self.mlflow_logger.log_metrics(log_data.to_dict(), step=log_data.step) - if self.comet_logger is not None: - self.comet_logger.log_metrics(log_data.to_dict(), step=log_data.step) - - # Log MoE load balance metrics (already collected/reduced on all ranks) - if self.step_scheduler.is_remote_logging_step: - if wandb.run is not None: - self._log_moe_metrics(self.step_scheduler.step, wandb.log) - if self.comet_logger is not None: - self._log_moe_metrics( - self.step_scheduler.step, lambda m, step: self.comet_logger.log_metrics(m, step=step) - ) - - # JSONL training log (always log for detailed local records) - self.metric_logger_train.log(log_data) - logging.info( - "step {} | epoch {} | loss {:.4f} | grad_norm {:.4f} | lr {:.2e} | mem {:.2f} GiB | tps {:.2f}({:.2f}/gpu) | num_label_tokens {}".format( - log_data.step, - log_data.epoch, - log_data.metrics["loss"], - log_data.metrics["grad_norm"], - log_data.metrics["lr"], - log_data.metrics["mem"], - log_data.metrics["tps"], - log_data.metrics["tps_per_gpu"], - log_data.metrics["num_label_tokens"], - ) - ) - torch.cuda.reset_peak_memory_stats() - - -# --------------------------------------------------------------------------- -# Entry point -# --------------------------------------------------------------------------- - - -def main(config_path=None): - """Main entry point for the fine-tuning recipe. - - Loads the configuration, sets up the trainer, and initiates the training loop. 
- """ - if config_path is None: - config_path = pathlib.Path(__file__).parent.resolve() / "llama_3_2_1b_hellaswag.yaml" - cfg = parse_args_and_load_config(config_path) - trainer = TrainFinetuneRecipeForNextTokenPrediction(cfg) - trainer.setup() - trainer.run_train_validation_loop() - - -if __name__ == "__main__": - main() - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/_peft/lora.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import math -from dataclasses import dataclass, field -from typing import Any, Literal, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.distributed.tensor import DTensor -from torch.distributed.tensor.placement_types import Shard as _Shard - -from nemo_automodel.components._peft.lora_experts import GroupedExpertsDeepEPLoRA, GroupedExpertsLoRA -from nemo_automodel.components._peft.lora_kernel import ( - lora_da_dx_update_wrapper, - lora_db_update_wrapper, - lora_forward_wrapper, -) -from nemo_automodel.components._peft.module_matcher import ModuleMatcher -from nemo_automodel.components.moe.layers import GroupedExperts, GroupedExpertsDeepEP, GroupedExpertsTE -from nemo_automodel.shared.import_utils import safe_import, safe_import_te -from nemo_automodel.shared.utils import dtype_from_str - -HAS_BNB, bitsandbytes = safe_import("bitsandbytes") -HAS_TE, transformer_engine = safe_import_te() - -logger = logging.getLogger(__name__) - - -@dataclass -class PeftConfig: - target_modules: list = field(default_factory=list) - exclude_modules: list = field(default_factory=list) - match_all_linear: bool = False - dim: int = 8 - alpha: int = 32 - # Note: we currently support DoRA for nn.Linear only. 
- use_dora: bool = False - dropout: float = 0.0 - dropout_position: Literal["pre", "post"] = "post" - lora_A_init: str = "xavier" - lora_dtype: Optional[torch.dtype] = None - use_triton: bool = False - moe_rank_scaling: bool = False - - def to_dict(self): - return self.__dict__.copy() - - @classmethod - def from_dict(cls, d: dict[str, Any]): - return cls( - target_modules=d.get("target_modules", []), - exclude_modules=d.get("exclude_modules", []), - match_all_linear=d.get("match_all_linear", False), - dim=d.get("dim", 8), - alpha=d.get("alpha", 32), - use_dora=d.get("use_dora", False), - dropout=d.get("dropout", 0.0), - dropout_position=d.get("dropout_position", "post"), - lora_A_init=d.get("lora_A_init", "xavier"), - lora_dtype=d.get("lora_dtype", None), - use_triton=d.get("use_triton", False), - moe_rank_scaling=d.get("moe_rank_scaling", False), - ) - - -class LinearLoRA(nn.Linear): - """ - Linear + LoRA, maintains ckpts structure (i.e. Linear's weight/bias remain at the same FQN). - - The _init_wrapper and _forward methods provide the LoRA functionality. We want to be able to - use those inside LinearLoRA but also for monkey-patching modules, without repeating the - same code -> therefore those are decorated with @staticmethod. - """ - - def __init__( - self, - orig_linear, - dim=8, - alpha=32, - use_dora: bool = False, - dropout=0.0, - dropout_position="post", - lora_A_init_method="xavier", - lora_dtype=None, - ): - """ - LinearLora constructor. - - Args: - orig_linear (nn.Module): the linear module to augment. - dim (int): lora's dim in_features -> dim -> out_features. - alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.0). - dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post) - lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) - lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they - are quantized weights (e.g. 
4bit) needs to be specified explicitly. - """ - assert isinstance(orig_linear, nn.Linear) - super(LinearLoRA, self).__init__( - in_features=orig_linear.in_features, - out_features=orig_linear.out_features, - bias=orig_linear.bias is not None, - device=orig_linear.weight.device, - dtype=orig_linear.weight.dtype, - ) - # copy weights - self.weight.data.copy_(orig_linear.weight.data) - if orig_linear.bias is not None: - self.bias.data.copy_(orig_linear.bias.data) - # initialize the adapte - LinearLoRA._init_adapter( - self, - dim=dim, - alpha=alpha, - use_dora=use_dora, - dropout=dropout, - dropout_position=dropout_position, - lora_A_init_method=lora_A_init_method, - lora_dtype=lora_dtype, - ) - - @torch.no_grad - def init_lora_weights(self, init_method: str): - """ - Initialize the LoRA weights. - - Args: - init_method (str): Method to initialize the LoRA weights. - """ - if init_method == "xavier": - nn.init.xavier_normal_(self.lora_A.weight.data) - else: - nn.init.kaiming_uniform_(self.lora_A.weight.data, a=math.sqrt(5)) - self.lora_B.weight.data.fill_(0) - - @torch.no_grad - @staticmethod - def _init_adapter( - obj, - dim=8, - alpha=32, - use_dora: bool = False, - dropout=0.0, - dropout_position="post", - lora_A_init_method="xavier", - lora_dtype=None, - ): - """ - Adds LoRA weights to obj. Obj is either a LinearLoRA or an nn.Module (when monkey-patching). - - Args: - obj (LinearLoRA | nn.Module): input module to adapt. - dim (int): lora's dim in_features -> dim -> out_features. - alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.0). - dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post) - lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) - lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they - are quantized weights (e.g. 4bit) needs to be specified explicitly. 
- """ - obj.dim = dim - obj.scale = alpha / dim - obj.use_dora = bool(use_dora) - - # Freezer - device = obj.weight.device - obj.weight.requires_grad = False - if obj.bias is not None: - obj.bias.requires_grad = False - - in_features = obj.in_features - out_features = obj.out_features - if isinstance(lora_dtype, str): - lora_dtype = dtype_from_str(lora_dtype) - assert lora_dtype is None or isinstance(lora_dtype, torch.dtype) - dtype = lora_dtype or obj.weight.dtype - - if HAS_TE and isinstance(obj, transformer_engine.pytorch.Linear): - obj.lora_A = transformer_engine.pytorch.Linear( - in_features=in_features, out_features=dim, bias=False, device=device, params_dtype=dtype - ) - obj.lora_B = transformer_engine.pytorch.Linear( - in_features=dim, out_features=out_features, bias=False, device=device, params_dtype=dtype - ) - else: - obj.lora_A = nn.Linear(in_features, dim, bias=False, dtype=dtype, device=device) - obj.lora_B = nn.Linear(dim, out_features, bias=False, dtype=dtype, device=device) - LinearLoRA.init_lora_weights(obj, lora_A_init_method) - obj.dropout_p = dropout - assert dropout_position in ["pre", "post"], ("dropout position can only be pre/post", dropout_position) - obj.dropout_position = dropout_position - - if obj.use_dora: - # initialize DoRA magnitude vector to ||W|| (row-wise L2 norm). - with torch.no_grad(): - weight_norm = torch.linalg.norm(obj.weight.data, dim=1).to(dtype=dtype, device=device) - obj.lora_magnitude = nn.Parameter(weight_norm, requires_grad=True) - - def _dora_weight_norm(self) -> torch.Tensor: - """ - Compute the detached weight norm used by DoRA. 
- """ - # ΔW = B @ A, shapes: [out, dim] @ [dim, in] => [out, in] - delta_w = (self.lora_B.weight @ self.lora_A.weight).detach().to(self.weight.dtype) - weight = self.weight.to(self.weight.dtype) - weight_norm = torch.linalg.norm(weight + self.scale * delta_w, dim=1).to(weight.dtype) - return weight_norm.detach() - - def forward(self, x): - """ - Forward pass through the original linear layer augmented with the LoRA pathway. - - Applies LoRA either before or after the dropout, depending on the configuration. - The result of the original linear transformation is combined with the LoRA output. - - Args: - x (Tensor): Input tensor of shape (batch_size, in_features). - - Returns: - Tensor: Output tensor of shape (batch_size, out_features). - """ - # pylint: disable=C0115,C0116 - # If LinearLoRA is used to monkey-patch a nn.Linear module, we want to use nn.Linear's - # forward in the case where it uses quantized weights. We store a reference to nn.Linear's - # forward in `super_fwd` attribute. If the attribute does not exist we do the usual linear. - if (fwd := getattr(self, "super_fwd", None)) is not None: - assert fwd != self.forward - res = fwd(x) - else: - # TE Linear can expose an empty .bias tensor (numel()==0) when bias=False; treat as no bias. - bias = self.bias - if bias is not None and bias.numel() == 0: - bias = None - # bmm avoids aten.view which cannot flatten a sharded dimension. - # F.linear calls view([b,s,h]->[b*s,h]) which fails when dim 0/1 is sharded - # (sequence parallelism) or during AOT-autograd tracing with compile. 
- _x_needs_bmm = ( - isinstance(x, DTensor) - and x.dim() == 3 - and any(isinstance(p, _Shard) and p.dim < 2 for p in x.placements) - ) - if torch.compiler.is_compiling() or _x_needs_bmm: - b = x.shape[0] - res = torch.bmm(x, self.weight.t().unsqueeze(0).expand(b, -1, -1)) - if bias is not None: - res = res + bias - else: - res = F.linear(x, self.weight, bias) - - if not self.use_dora: - if self.dropout_position == "pre": - x = F.dropout(x, p=self.dropout_p, training=self.training) - - # Apply scale before lora_B to keep lora_res as a Partial tensor. - # This allows both res and lora_res to remain Partial, so only one reduce-scatter is needed after addition. - # Multiplying after lora_B would convert Partial to Replicate, causing an extra reduce-scatter operation. - lora_res = self.lora_B(self.lora_A(x) * self.scale) - if self.dropout_position == "post": - lora_res = F.dropout(lora_res, p=self.dropout_p, training=self.training) - return res + lora_res - - if getattr(self, "lora_magnitude", None) is None: - raise RuntimeError("use_dora=True but lora_magnitude was not initialized") - - if self.dropout_position == "pre" and self.training and self.dropout_p > 0.0: - x_lora = F.dropout(x, p=self.dropout_p, training=True) - base_result = None - else: - x_lora = x - base_result = res - - lora_result = self.lora_B(self.lora_A(x_lora)) - if self.dropout_position == "post": - lora_result = F.dropout(lora_result, p=self.dropout_p, training=self.training) - - # Compute DoRA scaling factor. - weight_norm = self._dora_weight_norm() - mag = self.lora_magnitude.to(x.dtype) - weight_norm = weight_norm.to(x.dtype) - - # Broadcast magnitude scaling across batch/sequence dimensions. - mag_norm_scale = mag / weight_norm - if res.dim() == 3: - mag_norm_scale = mag_norm_scale.view(1, 1, -1) - else: - mag_norm_scale = mag_norm_scale.view(1, -1) - - # HF PEFT subtracts bias from base_result before applying scaling terms. 
- if base_result is not None: - bias = self.bias - if bias is not None and bias.numel() > 0: - base_no_bias = base_result - bias - else: - base_no_bias = base_result - else: - # Recompute base linear output without bias on x_lora (see HF PEFT DoraLinearLayer.forward). - base_no_bias = F.linear(x_lora, self.weight, None) - - dora_extra = (mag_norm_scale - 1) * base_no_bias + mag_norm_scale * lora_result * self.scale - return res + dora_extra - - -class TritonLinearLoRA(LinearLoRA): - """ - Subclass of LinearLoRA that uses triton kernels for forward and backward passes. - - Args: - orig_linear (nn.Module): the linear module to augment. - dim (int): lora's dim in_features -> dim -> out_features. - alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.0). - dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post) - lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) - lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they - are quantized weights (e.g. 4bit) needs to be specified explicitly. - """ - - def forward(self, x): - """ - Forward function for LoRA with triton kernels. - - Args: - x (torch.Tensor): the input tensor. - - Returns: - torch.Tensor: the output tensor. - """ - # If LinearLoRA is used to monkey-patch a nn.Linear module, we want to use nn.Linear's - # forward in the case where it uses quantized weights. We store a reference to nn.Linear's - # forward in `super_fwd` attribute. If the attribute does not exist we do the usual linear. 
- if (fwd := getattr(self, "super_fwd", None)) is not None: - assert fwd != self.forward - res = fwd(x) - else: - res = F.linear(x, self.weight, self.bias) - - if self.dropout_position == "pre": - x = F.dropout(x, p=self.dropout_p, training=self.training) - lora_res = LoRATritonFunction.apply(x, self.lora_A.weight, self.lora_B.weight, self.scale, x.dtype) - if self.dropout_position == "post": - lora_res = F.dropout(lora_res, p=self.dropout_p, training=self.training) - - return res + lora_res - - -def patch_linear_module( - orig_linear, - dim=8, - alpha=32, - use_dora: bool = False, - dropout=0.0, - dropout_position="post", - lora_A_init_method="xavier", - lora_dtype=None, - use_triton=True, - layer_name=None, -): - """ - Monkey-patches a nn.Linear (orig_linear param) to be a LinearLoRA. - - The orig_linear might not contain valid weights, for example, the given orig_linear was - initialized within a context-manager that uses a "meta" device. Therefore, we cannot copy - the weight/bias from the orig_linear to the LinearLoRA, since those have not been allocated, - - To circumvent this scenario, LinearLoRA's additional functionality (_init_adapter, _forward) - is based on static functions, so that we can use them for patching or when allocating a - new LinearLoRA object. - - Args: - orig_linear (nn.Linear): the module we add adapter to. - dim (int, optional): Lora dim. Defaults to 8. - alpha (int, optional): Lora alpha scale. Defaults to 32. - dropout (float, optional): dropout prob. Defaults to 0.0. - dropout_position (str, optional): location to apply dropout wrt lora. - Defaults to 'post' (choices: 'pre', 'post'). - lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'. - lora_dtype (_type_, optional): Lora weights' dtype. By default will use orig_linear's dtype - but orig_linear might use non-trainable dtype (e.g., 4bit), in which case the user must - specify the dtype manually. Defaults to None. 
- use_triton (bool, optional): By default we use the triton kernel LoRA implementation. - - Returns: - (nn.Module): the monkey-patched (nn.Linear + LoRA) nn.Module - """ - linear_types = [nn.Linear] - if HAS_TE: - linear_types.append(transformer_engine.pytorch.Linear) - use_triton = False - if not isinstance(orig_linear, tuple(linear_types)): - raise NotImplementedError("Expected isinstance(orig_linear, nn.Linear)") - assert not hasattr(orig_linear, "super_fwd"), orig_linear.super_fwd - - if use_dora: - if HAS_TE and isinstance(orig_linear, transformer_engine.pytorch.Linear): - raise ValueError("DoRA is not supported for transformer_engine.pytorch.Linear layers.") - if getattr(orig_linear, "quant_state", None) is not None: - raise ValueError("DoRA is not supported for quantized linear layers (e.g., BitsAndBytes).") - use_triton = False - - linear_lora_cls = TritonLinearLoRA if use_triton else LinearLoRA - linear_lora_cls._init_adapter( - orig_linear, - dim=dim, - alpha=alpha, - use_dora=use_dora, - dropout=dropout, - dropout_position=dropout_position, - lora_A_init_method=lora_A_init_method, - lora_dtype=lora_dtype, - ) - cls = orig_linear.__class__ - new_cls = type("PatchedLinearLoRA", (linear_lora_cls, cls), {}) - - # If the model uses quantized weights, we want to use orig_linear's forward - if ( - getattr(orig_linear, "quant_state", None) is not None - and orig_linear.quant_state.__class__ == bitsandbytes.functional.QuantState - ): - if HAS_TE: - assert not isinstance(orig_linear, transformer_engine.pytorch.Linear), ( - "quant_state is not supported with transformer_engine.pytorch.Linear" - ) - orig_linear.super_fwd = orig_linear.forward - elif HAS_TE and isinstance(orig_linear, transformer_engine.pytorch.Linear): - # Delegate base computation to TE's forward so TE kernels (including FP8) - # are used instead of falling back to F.linear(). 
- orig_linear.super_fwd = orig_linear.forward - - orig_linear.__class__ = new_cls - if layer_name is not None: - orig_linear._layer_name = layer_name - return orig_linear - - -def patch_moe_module( - orig_module, - dim=8, - alpha=32, - lora_A_init_method="xavier", - lora_dtype=None, -): - """ - Patches a custom MoE module (GroupedExperts or GroupedExpertsDeepEP) with LoRA. - - Args: - orig_module (nn.Module): The original MoE module to be patched. - dim (int, optional): LoRA rank (dimension). Defaults to 8. - alpha (int, optional): LoRA scaling factor. Defaults to 32. - lora_A_init_method (str, optional): Initialization method for LoRA A matrix. Defaults to "xavier". - lora_dtype (torch.dtype or str, optional): Data type for LoRA weights. Defaults to None. - - Returns: - nn.Module: The LoRA-wrapped MoE module (GroupedExpertsLoRA or GroupedExpertsDeepEPLoRA). - """ - if isinstance(orig_module, GroupedExpertsTE): - raise NotImplementedError("LoRA is not supported for Transformer Engine (TE) expert modules.") - elif isinstance(orig_module, GroupedExpertsDeepEP): - new_module = GroupedExpertsDeepEPLoRA( - orig_module, - lora_dim=dim, - alpha=alpha, - lora_A_init_method=lora_A_init_method, - lora_dtype=lora_dtype, - ) - elif isinstance(orig_module, GroupedExperts): - new_module = GroupedExpertsLoRA( - orig_module, - lora_dim=dim, - alpha=alpha, - lora_A_init_method=lora_A_init_method, - lora_dtype=lora_dtype, - ) - else: - raise NotImplementedError(f"Unsupported MoE module type: {type(orig_module)}") - - return new_module - - -# patch a model in-place -def apply_lora_to_linear_modules( - model: nn.Module, - peft_config: PeftConfig, - quantization_config=None, - skip_freeze: bool = False, -) -> int: - """ - Replace selected nn.Linear layers with LinearLoRA layers (in-place). - - Args: - model: The model to apply LoRA to. - peft_config: PEFT configuration for LoRA parameters. - quantization_config: Optional separate QLoRA quantization configuration. 
- skip_freeze: If True, skip the global parameter freeze (caller will handle it later). - - Returns: - Number of modules that were modified with LoRA. - - Note: - target_modules accepts wildcard fragments, e.g. ["q_proj", "k_proj", ".*fc.*"]. - """ - # Freeze base model parameters - if not skip_freeze: - for w in model.parameters(): - w.requires_grad_(False) - - is_causal_lm = False - try: - if ( - hasattr(model, "config") - and model.config.architectures is not None - and len(model.config.architectures) > 0 - and "CausalLM" in model.config.architectures[0] - ): - # for example, LlamaForCausalLM - is_causal_lm = True - except (AttributeError, TypeError): - is_causal_lm = False - - matcher = ModuleMatcher( - peft_config.target_modules, peft_config.exclude_modules, peft_config.match_all_linear, is_causal_lm - ) - num_modules_matched = 0 - for name, module in list(model.named_modules()): - if isinstance(module, (GroupedExperts, GroupedExpertsDeepEP, GroupedExpertsTE)): - if matcher.match(module, name): - if peft_config.use_dora: - raise NotImplementedError("DoRA is not supported for MoE expert modules in Automodel yet.") - num_modules_matched += 1 - lora_dtype = peft_config.lora_dtype - if quantization_config is not None and lora_dtype is None: - lora_dtype = quantization_config.bnb_4bit_compute_dtype or torch.bfloat16 - - # Compute effective LoRA rank for MoE modules - moe_dim = peft_config.dim - if peft_config.moe_rank_scaling: - n_act = module.config.n_activated_experts - moe_dim = peft_config.dim // n_act - if moe_dim < 1: - raise ValueError( - f"moe_rank_scaling: dim={peft_config.dim} // n_activated_experts={n_act} " - f"gives rank {moe_dim}. Increase dim to at least n_activated_experts." 
- ) - if peft_config.dim % n_act != 0: - logger.warning( - "moe_rank_scaling: dim=%d is not evenly divisible by n_activated_experts=%d; " - "using floor division rank=%d.", - peft_config.dim, - n_act, - moe_dim, - ) - - # Replace the module in the model - new_module = patch_moe_module( - module, - dim=moe_dim, - alpha=peft_config.alpha, - lora_A_init_method=peft_config.lora_A_init, - lora_dtype=lora_dtype, - ) - - # Find parent and replace - if "." not in name: - setattr(model, name, new_module) - else: - parent_name, child_name = name.rsplit(".", 1) - parent = model.get_submodule(parent_name) - setattr(parent, child_name, new_module) - else: - # Standard Linear patching - linear_types = [nn.Linear] + ([transformer_engine.pytorch.Linear] if HAS_TE else []) - if isinstance(module, tuple(linear_types)) and matcher.match(module, name): - num_modules_matched += 1 - # For QLora, set lora_dtype to float16/bfloat16 since base weights are quantized - lora_dtype = peft_config.lora_dtype - if quantization_config is not None and lora_dtype is None: - lora_dtype = quantization_config.bnb_4bit_compute_dtype or torch.bfloat16 - - patch_linear_module( - module, - dim=peft_config.dim, - alpha=peft_config.alpha, - use_dora=peft_config.use_dora, - dropout=peft_config.dropout, - dropout_position=peft_config.dropout_position, - lora_A_init_method=peft_config.lora_A_init, - lora_dtype=lora_dtype, - use_triton=peft_config.use_triton, - layer_name=name, - ) - - return num_modules_matched - - -class LoRATritonFunction(torch.autograd.Function): - """ - Autograd function that calls the triton kernel wrappers for the LoRA forward and backward passes. - """ - - @staticmethod - def setup_context(ctx, inputs, output): - """ - Stores context for LoRA backward pass. - """ - x, lora_A, lora_B, scale, _ = inputs - ctx.save_for_backward(x, lora_A, lora_B) - ctx.scale = scale - - @staticmethod - def forward(x, lora_A, lora_B, scale, dtype): - """ - Forward method for LoRATriton. 
- - Reshapes 3D tensors into 2D and then calls the triton kernel. - """ - reshape = x.dim() == 3 - if reshape: - bs, seq_len, d = x.shape - x = x.reshape(-1, d) - - lora_res = lora_forward_wrapper(x, lora_A.t(), lora_B.t(), res=None, scale=scale, dtype=dtype) - - if reshape: - return lora_res.view(bs, seq_len, -1) - else: - return lora_res - - @staticmethod - def backward(ctx, d_y): - """ - Backward method for LoRATriton. - - Reshapes 3D tensors into 2D and then calls the kernels to update d_lora_a, d_lora_b, and dx. - """ - x, lora_A, lora_B = ctx.saved_tensors - scale = ctx.scale - dtype = x.dtype - - reshape = x.dim() == 3 - if reshape: - bs, seq_len, d = x.shape - d_y = d_y.reshape(-1, d_y.shape[-1]) - x = x.reshape(-1, d) - - d_lora_A, d_x = lora_da_dx_update_wrapper(x.t(), d_y, lora_B, lora_A, scale, dtype=dtype) - d_lora_B = lora_db_update_wrapper(lora_A, x.t(), d_y, scale, dtype) - - if reshape: - d_x = d_x.view(bs, seq_len, d) - return d_x, d_lora_A.t(), d_lora_B, None, None - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/_peft/module_matcher.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import re -from dataclasses import dataclass, field -from typing import List - -import torch.nn as nn - -from nemo_automodel.shared.import_utils import safe_import_te - -HAS_TE, transformer_engine = safe_import_te() -import logging - -logger = logging.getLogger(__name__) -from functools import lru_cache - - -def _is_linear_module(module): - return isinstance(module, nn.Linear) or (HAS_TE and isinstance(module, transformer_engine.pytorch.Linear)) - - -@lru_cache(maxsize=1000) -def _compile_wildcard_pattern(pattern): - pattern = re.sub(r"(? (.*) - return re.compile("^" + pattern + "$") - - -def wildcard_match(pattern, key): - """ - Return whether the pattern (target module to add LoRA) matches the key (model weight name). - - Example: - -------- - >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.0.self_attention.linear_qkv") - True - >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.1.self_attention.linear_qkv") - False - """ - if key is None: - return False - regex_pattern = _compile_wildcard_pattern(pattern) - match = regex_pattern.match(key) - return match is not None - - -@dataclass -class ModuleMatcher: - """ - Matches Modules to apply PEFT adapters on. - - Args: - target_modules (List[str], optional): A list of module names to apply LoRA to. - Defaults to an empty list. - If empty and no other parameter is provided it will match to "*_proj". - Target modules can also contain wildcards (e.g. "*.layers.0.*.linear_qkv"). For example, you can specify - target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv - on the first two layers. - exclude_modules (List[str], optional): A list of module names to exclude from applying LoRA to. - Defaults to an empty list. - Exclude modules can also contain wildcards (e.g. "*.lm_head"). For example, you can specify - exclude_modules=['*.lm_head'] to exclude the lm_head. - match_all_linear (bool, optional): Whether to match all linear layers. 
- Defaults to False. Prefer using target_modules or exclude_modules to specify the modules to match, - to avoid issues with downstream tools (e.g., vLLM, etc). - is_causal_lm (bool, optional): Whether the model is a causal language model. - """ - - target_modules: List[str] = field(default_factory=list) - exclude_modules: List[str] = field(default_factory=list) - match_all_linear: bool = field(default=False) - is_causal_lm: bool = field(default=False) - - def __post_init__(self): - """ - Input validation. - """ - if self.target_modules is None: - self.target_modules = [] - if self.exclude_modules is None: - self.exclude_modules = [] - if isinstance(self.target_modules, str): - self.target_modules = [self.target_modules] - if isinstance(self.exclude_modules, str): - self.exclude_modules = [self.exclude_modules] - if self.match_all_linear is False and len(self.target_modules) == 0 and len(self.exclude_modules) == 0: - logger.warning( - "No modules specified for LoRA. Will use target_modules='*_proj' by default." - """ - Equivalent to the following YAML configuration: - peft: - target_modules: '*_proj' - If this is not what you want, please specify target_modules or exclude_modules. - """ - ) - self.target_modules = ["*_proj"] - - if self.target_modules and self.exclude_modules: - raise ValueError( - "target_modules and exclude_modules are mutually exclusive. Please provide only one of them." - ) - if self.match_all_linear and (len(self.target_modules) > 0 or len(self.exclude_modules) > 0): - raise ValueError( - "Expected target_modules/exclude_modules to be empty when match_all_linear is true. Please provide only one of them." - ) - if self.match_all_linear: - logger.warning( - "match_all_linear is true. This will match all linear layers in the model (including lm_head). 
" - "Please consider using target_modules or exclude_modules to specify the modules to match, to avoid issues with downstream tools " - "For example, to match all linear layers except the lm_head, you can use: " - "peft: " - " target_modules: '*_proj' " - ) - - # --------------------------------------------------------------------- # - # Public API # - # --------------------------------------------------------------------- # - def match(self, m: nn.Module, name: str = None, prefix: str = None): - """ - Return (pattern, full_name) if the module matches; otherwise None. - """ - full_name = f"{prefix}.{name}" if prefix else name - - # 1. matching by layer type takes absolute precedence - if self.match_all_linear and _is_linear_module(m): - return True - - # 2. target_modules is the next most-specific rule set - elif self.target_modules: - assert not self.exclude_modules, "`exclude_modules` must be empty when `target_modules` is used." - for pattern in self.target_modules: - if name == pattern or wildcard_match(pattern, full_name): - return True - return False - # 3. Fallback: “all linear layers except those explicitly excluded” - else: - return ( - name not in self.exclude_modules - and not any(wildcard_match(pattern, full_name) for pattern in self.exclude_modules) - and _is_linear_module(m) - ) - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/utils.py -```py -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from typing import Optional - -import torch -from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask - - -def batchify(tensor, default_tensor_cls=torch.LongTensor): - """ - Ensures that the input tensor has at least two dimensions by adding an extra batch dimension if necessary. - - Args: - tensor (torch.Tensor): The input tensor to be batchified. - - Returns: - torch.Tensor: The tensor with an extra dimension added if it was originally 1-dimensional. - Otherwise, the tensor is returned as-is. - """ - if not isinstance(tensor, torch.Tensor): - tensor = default_tensor_cls(tensor) - if tensor.ndim == 1: - return tensor.unsqueeze_(0) - return tensor - - -def extract_key_from_dicts(batch, key): - """ - Extracts the value of the given key from each dictionary in a list of dictionaries. - - Args: - batch (List[dict]): A list of dictionaries. - key (str): The key whose values are to be extracted from each dictionary. - - Returns: - List: A list of values associated with the specified key, in the same order as - the dictionaries in the input batch. - """ - return list(map(lambda x: x[key], batch)) - - -def pad_within_micro(batch, pad_token_id, pad_seq_len_divisible=None): - """ - Pads each list in a batch of lists to the same length with a specified token. - - Args: - batch (List[List[int]]): A batch of sequences (e.g., token IDs), where each sequence - is a list of integers. - pad_token_id (int): The token ID to use for padding shorter sequences. - pad_seq_len_divisible (int): The value to use for padding sequence length so that it is - divisible by pad_seq_len_divisible. - - Returns: - List[List[int]]: A batch of sequences where each inner list has been padded with the pad - token to match the length of the longest sequence in the batch. 
- """ - max_len = max(map(len, batch)) - if pad_seq_len_divisible: - max_len = math.ceil(max_len / pad_seq_len_divisible) * pad_seq_len_divisible - if pad_token_id is None: - # if it's none, extend the last token - pad_token_id = batch[0][-1] - return [item + [pad_token_id] * (max_len - len(item)) for item in batch] - - -def find_last_non_pad_token(lst: list[int], value: int) -> int | None: - # lst = [optional-value .., non-value, ..., non-value, value, ...] - # return the index of the last non-value token - i = len(lst) - 1 - found = False - while i >= 0: - if lst[i] == value: - i -= 1 - found = True - else: - if found: - return i - else: - return None - return None - - -def get_pad_token_from_key(val: str, pad_token_ids: Optional[dict[str, int]] = None) -> int | None: - PAD_TOKEN_IDS = { - "labels": -100, - "attention_mask": 0, - "loss_mask": 0, - "input_ids": 0, - } - if pad_token_ids is None: - pad_token_ids = {} - ans = pad_token_ids.get(val, PAD_TOKEN_IDS.get(val, None)) - return ans - - -def make_attention_mask_from_labels(ids: list[int], ignore_token: int = -100) -> list[int]: - # if the last token is not an ignore token, then the attention mask is all 1s - if len(ids) == 0: - return [] - if ids[-1] != ignore_token: - ans = [1] * len(ids) - else: - # otherwise, find the last non-pad token and set the attention mask to 1s up to that point - last_non_pad_token_pos = find_last_non_pad_token(ids, ignore_token) - if last_non_pad_token_pos is None: - ans = [1] * len(ids) - else: - ans = [1] * (last_non_pad_token_pos + 1) - ans = ans + [0] * (len(ids) - len(ans)) - assert len(ans) == len(ids) - return ans - - -def create_causal_mask_mapping( - model_config, - batch_size, - seq_len, - position_ids=None, - attention_mask=None, - device=None, -): - """ - Create causal mask mapping for pipeline parallelism. - - This is the core mask creation logic that can be reused by different collate functions. 
- Extracts common mask creation logic to avoid duplication between collate functions. - - Args: - model_config: HuggingFace model config - batch_size: Batch size - seq_len: Sequence length - position_ids: Optional position IDs tensor [batch_size, seq_len] - attention_mask: Optional 2D attention mask tensor [batch_size, seq_len] for padding - device: Device to create tensors on (defaults to cpu) - - Returns: - dict: Mapping of mask types to 4D mask tensors - - "full_attention": [batch_size, 1, seq_len, seq_len] - - "sliding_attention": [batch_size, 1, seq_len, seq_len] (if model uses sliding window) - """ - if device is None: - device = torch.device("cpu") - - # Create position_ids if not provided - if position_ids is None: - position_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1) - - # Prepare mask creation kwargs - mask_kwargs = { - "config": model_config, - "input_embeds": torch.empty((batch_size, seq_len), device=device), - "attention_mask": attention_mask, - "cache_position": position_ids[0], # Use first row (all rows identical for non-padded data) - "past_key_values": None, # Training only - "position_ids": position_ids, - } - - # Create causal masks - causal_mask_mapping = { - "full_attention": create_causal_mask(**mask_kwargs), - } - - # Add sliding window mask if model uses it - if hasattr(model_config, "sliding_window") and model_config.sliding_window is not None: - causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) - - return causal_mask_mapping - - -def add_causal_masks_to_batch(batch_dict, model_config): - """ - Add precomputed causal masks to an already-batched data dict. - - This function is designed for datasets that yield complete batches (like MockIterableDataset), - where we want to add mask precomputation as a separate processing step. 
- - Args: - batch: A dict or list containing a single batched dict with tensors: - - input_ids: [batch_size, seq_length] - - position_ids: [batch_size, seq_length] (optional) - - labels: [batch_size, seq_length] - model_config: HuggingFace model config for creating causal masks - precompute_masks: If False, skip mask creation (for compatibility with train_ft.py wrapper) - - Returns: - dict: Same batch with added causal_mask_mapping field - """ - # Extract info from batch - batch_size = batch_dict["input_ids"].shape[0] - seq_len = batch_dict["input_ids"].shape[1] - position_ids = batch_dict.get("position_ids") - attention_mask = batch_dict.get("attention_mask") # May have padding info - - # Create causal masks using the shared helper function - causal_mask_mapping = create_causal_mask_mapping( - model_config=model_config, - batch_size=batch_size, - seq_len=seq_len, - position_ids=position_ids, - attention_mask=attention_mask, - device=batch_dict["input_ids"].device, - ) - - batch_dict["causal_mask_mapping"] = causal_mask_mapping - return batch_dict - - -def default_collater(batch, pad_seq_len_divisible=None): - """ - Default batch collator that handles padding and batching. - - Args: - batch: A batch of examples. - pad_seq_len_divisible: If provided, pad sequence length to be divisible by this value. - - Returns: - dict: A dictionary containing batched tensors. 
- """ - pad_token_ids = batch[0].pop("___PAD_TOKEN_IDS___", None) - # ans contains a dict with: - # key: str (e.g., "input_ids", "attention_mask", "labels", "loss_mask") - # value: list[list[int]] (e.g., [[1, 2, 3], [4, 5, 6]]) - ans = { - key: pad_within_micro( - extract_key_from_dicts(batch, key), - get_pad_token_from_key(key, pad_token_ids), - pad_seq_len_divisible, - ) - for key in batch[0].keys() - } - - # convert to tensors - result = {k: batchify(torch.LongTensor(v)) for k, v in ans.items()} - - # Add padding_mask similar to cp_utils.py - if "input_ids" in result: - input_ids_pad_token = get_pad_token_from_key("input_ids", pad_token_ids) or 0 - result["padding_mask"] = (result["input_ids"] == input_ids_pad_token).bool() - - return result - - -def packed_sequence_thd_collater(batch): - """ - Collater for packed sequences in THD (total, hidden, depth) format. - - This collater is designed for THD format, where multiple variable-length - sequences are concatenated with/without padding tokens between them. The THD format represents - sequences as (total_tokens, hidden_dim, depth) where total_tokens is the sum of all sequence - lengths in the batch. 
- - Unlike traditional padding-based approaches (BSHD/SBHD formats), this THD format: - - Concatenates sequences directly: [a a a b b c c c c] - - Uses seq_lens to identify sequence boundaries for attention computation - - Supports optional identifier or padding tokens between sequences via seq_lens_padded - - This collater supports both pipeline parallelism (PP) and non-PP use cases by: - - Stacking token-level tensors (input_ids, labels, position_ids) along batch dimension - - Padding and stacking seq_lens and seq_lens_padded with sentinel value -1000 - - Including 'qkv_format': 'thd' in the output to indicate THD format - - When batch items lack packed-sequence metadata (seq_lens, seq_lens_padded, position_ids), - such as samples from ChatDataset, this collater synthesizes the missing fields so that each - sample is treated as a single-sequence "pack". Variable-length sequences are padded to the - longest length in the batch. This enables using THD format with TE context parallelism - without requiring the dataset to perform actual sequence packing. - - Args: - batch (List[dict]): A list of dictionaries, where each dictionary represents one example. - - For pre-packed data, each dictionary should contain: - - 'input_ids': List[int] - Token IDs for all packed sequences (must be same length across batch) - - 'labels': List[int] - Labels for all packed sequences (must be same length across batch) - - 'position_ids': List[int] - Position IDs for all tokens (must be same length across batch) - - 'seq_lens': List[int] - Actual sequence lengths for each packed sequence - - 'seq_lens_padded': List[int] - Sequence lengths including identifier/padding tokens - - For non-packed data (e.g. 
ChatDataset), each dictionary needs only: - - 'input_ids': List[int] - Token IDs (variable length across batch) - - 'labels': List[int] - Labels (same length as input_ids) - - 'attention_mask': List[int] - (optional) 1 for real tokens, 0 for padding - - Example batch with 2 packed examples, both with 6 total tokens: - [ - { - 'input_ids': [1, 2, 3, 99, 4, 5], # Two sequences: [1,2,3] and [4,5] with sep token 99 - 'labels': [1, 2, 3, -100, 4, 5], - 'position_ids': [0, 1, 2, 0, 0, 1], - 'seq_lens': [3, 2], # Actual sequence lengths (excluding separator) - 'seq_lens_padded': [4, 2] # Including separator token - }, - { - 'input_ids': [6, 7, 99, 8, 9, 10], # Two sequences with separator - 'labels': [6, 7, -100, 8, 9, 10], - 'position_ids': [0, 1, 0, 0, 1, 2], - 'seq_lens': [2, 3], - 'seq_lens_padded': [3, 3] - } - ] - - Returns: - dict: A dictionary with batched tensors: - - 'input_ids': tensor of shape [batch_size, seq_len] - stacked token sequences - - 'labels': tensor of shape [batch_size, seq_len] - stacked labels - - 'position_ids': tensor of shape [batch_size, seq_len] - stacked position IDs - - 'seq_lens': tensor of shape [batch_size, max_num_packs] - padded sequence lengths - - 'seq_lens_padded': tensor of shape [batch_size, max_num_packs] - padded lengths with separators - - 'qkv_format': str - Always 'thd' to indicate THD format - - Note: seq_lens and seq_lens_padded are padded with -1000 to handle variable number of - packed sequences per example. These sentinel values should be filtered out before use. - """ - # Extract and remove padding token metadata if present - pad_token_ids = None - if len(batch) > 0 and "___PAD_TOKEN_IDS___" in batch[0]: - pad_token_ids = batch[0].get("___PAD_TOKEN_IDS___") - for item in batch: - item.pop("___PAD_TOKEN_IDS___", None) - - if len(batch) == 0: - return {} - - # If batch items lack packed-sequence metadata (e.g. 
from ChatDataset), - # synthesize seq_lens, seq_lens_padded, and position_ids so that each - # sample is treated as a single-sequence "pack". - if "seq_lens" not in batch[0]: - input_ids_pad = get_pad_token_from_key("input_ids", pad_token_ids) or 0 - max_len = max(len(item["input_ids"]) for item in batch) - - for item in batch: - cur_len = len(item["input_ids"]) - if "attention_mask" in item: - actual_len = sum(item["attention_mask"]) - item.pop("attention_mask") - else: - actual_len = cur_len - - pad_amount = max_len - cur_len - item["seq_lens"] = [actual_len] - # seq_lens_padded must cover the full padded length so that - # cu_seqlens_padded[-1] == total_tokens in the downstream THD pipeline. - item["seq_lens_padded"] = [max_len] - item["position_ids"] = list(range(max_len)) - - if pad_amount > 0: - item["input_ids"] = list(item["input_ids"]) + [input_ids_pad] * pad_amount - item["labels"] = list(item["labels"]) + [-100] * pad_amount - - tokens = batchify(torch.stack([torch.tensor(x["input_ids"]) for x in batch])) - labels = batchify(torch.stack([torch.tensor(x["labels"]) for x in batch])) - position_ids = batchify(torch.stack([torch.tensor(x["position_ids"]) for x in batch])) - - seq_lens = batchify(torch.LongTensor(pad_within_micro([x["seq_lens"] for x in batch], -1000))) - seq_lens_padded = batchify(torch.LongTensor(pad_within_micro([x["seq_lens_padded"] for x in batch], -1000))) - - return { - "input_ids": tokens, - "labels": labels, - "position_ids": position_ids, - "seq_lens": seq_lens, - "seq_lens_padded": seq_lens_padded, - "qkv_format": "thd", - } - - -def _indexed_mask_to_4d_block_causal(attention_mask: torch.Tensor) -> torch.Tensor: - """Convert an indexed attention mask to a 4D block-causal mask. - - Args: - attention_mask: Integer tensor of shape ``[B, S]`` where each - position contains the 1-based index of the sub-sequence it - belongs to (0 = padding). 
- - Returns: - Bool tensor of shape ``[B, 1, S, S]`` suitable for - ``eager`` / ``sdpa`` attention backends. ``True`` means the - position is **allowed** to attend. - """ - # attention_mask: [B, S] - B, S = attention_mask.shape - - # same_doc[b, i, j] = True iff positions i and j belong to the same sub-sequence - mask_q = attention_mask.unsqueeze(2) # [B, S, 1] - mask_k = attention_mask.unsqueeze(1) # [B, 1, S] - same_doc = mask_q == mask_k # [B, S, S] - - # causal: position i can attend to position j only if j <= i - causal = torch.ones(S, S, dtype=torch.bool, device=attention_mask.device).tril() # [S, S] - - # not_padding: both positions must be non-padding (index > 0) - not_padding_q = (attention_mask > 0).unsqueeze(2) # [B, S, 1] - not_padding_k = (attention_mask > 0).unsqueeze(1) # [B, 1, S] - - mask_4d = same_doc & causal.unsqueeze(0) & not_padding_q & not_padding_k # [B, S, S] - - return mask_4d.unsqueeze(1) # [B, 1, S, S] - - -def neat_packed_collater(batch: list[dict], attn_implementation: str = "sdpa") -> dict: - """Collater for neat-packed LLM sequences. - - Stacks ``input_ids``, ``labels``, ``position_ids`` and converts the - indexed ``attention_mask`` to the format required by the attention backend. - - For ``flash_attention_2``: keeps the indexed 2D mask ``[B, S]``. - For ``sdpa`` / ``eager``: converts to a 4D block-causal float mask. - - Args: - batch: List of sample dicts produced by ``neat_pack_dataset``. - attn_implementation: Attention backend (``"flash_attention_2"``, - ``"sdpa"``, or ``"eager"``). - - Returns: - Dict with batched tensors ready for model forward. 
- """ - if not batch: - return {} - - input_ids = batchify(torch.stack([torch.as_tensor(x["input_ids"]) for x in batch])) - labels = batchify(torch.stack([torch.as_tensor(x["labels"]) for x in batch])) - position_ids = batchify(torch.stack([torch.as_tensor(x["position_ids"]) for x in batch])) - attention_mask = batchify(torch.stack([torch.as_tensor(x["attention_mask"]) for x in batch])) - - if attn_implementation == "flash_attention_2": - mask_out = attention_mask - else: - mask_out = _indexed_mask_to_4d_block_causal(attention_mask) - - return { - "input_ids": input_ids, - "labels": labels, - "position_ids": position_ids, - "attention_mask": mask_out, - } - - -class SFTSingleTurnPreprocessor: - """ - Generic single-turn text-to-text SFT (supervised-fine-tuning) pre-processor. - - Args: - tokenizer: Pre-trained tokenizer (HF). - """ - - def __init__(self, tokenizer): - """ - SFTSingleTurnPreprocessor constructor. - - Args: - tokenizer: Pretrained tokenizer. - """ - self.tokenizer = tokenizer - self.block_size = None - self.preprocessing_num_workers = 1 - self.overwrite_cache = False - self.pad_to_max_length = True - - def _tokenize_function(self, examples, dataset): - ctx = dataset.get_context(examples) - tgt = dataset.get_target(examples) - - ctx_tok = self.tokenizer(ctx) - tgt_tok = self.tokenizer(tgt) - - # strip trailing special token from context - if len(ctx_tok["input_ids"][0]) > 0 and ctx_tok["input_ids"][0][-1] in self.tokenizer.all_special_ids: - ctx_tok["input_ids"] = [ids[:-1] for ids in ctx_tok["input_ids"]] - ctx_tok["attention_mask"] = [m[:-1] for m in ctx_tok["attention_mask"]] - - # strip leading special token from target - if len(tgt_tok["input_ids"][0]) > 0 and tgt_tok["input_ids"][0][0] in self.tokenizer.all_special_ids: - tgt_tok["input_ids"] = [ids[1:] for ids in tgt_tok["input_ids"]] - tgt_tok["attention_mask"] = [m[1:] for m in tgt_tok["attention_mask"]] - - out = {} - out["input_ids"] = [ - c_ids + t_ids for c_ids, t_ids in 
zip(ctx_tok["input_ids"], tgt_tok["input_ids"], strict=False) - ] - out["attention_mask"] = [ - c_m + t_m for c_m, t_m in zip(ctx_tok["attention_mask"], tgt_tok["attention_mask"], strict=False) - ] - # label: -100 for ctx, true ids for tgt - out["labels"] = [ - [-100] * (len(c_ids) - 1) + t_ids + [-100] - for c_ids, t_ids in zip(ctx_tok["input_ids"], tgt_tok["input_ids"], strict=False) - ] - - out["loss_mask"] = [[1 if t != -100 else 0 for t in lbl] for lbl in out["labels"]] - return out - - def _compute_dataset_max_len(self, tokenized_ds): - max_len = max(map(lambda x: len(x["input_ids"]), tokenized_ds)) - # make multiple of 8 - max_len = math.ceil(max_len / 8) * 8 - # respect model block size - if self.block_size is not None: - max_len = min(max_len, self.block_size) - return max_len - - def _pad_function(self, max_len): - tk = self.tokenizer - - def _pad(examples): - pad_id = tk.pad_token_id or 0 - examples["input_ids"] = [ - (ids[:max_len] + [pad_id] * max(0, max_len - len(ids))) for ids in examples["input_ids"] - ] - examples["attention_mask"] = [ - ([1] * min(len(ids), max_len) + [0] * max(0, max_len - len(ids))) for ids in examples["attention_mask"] - ] - examples["labels"] = [(lbl[:max_len] + [-100] * max(0, max_len - len(lbl))) for lbl in examples["labels"]] - examples["loss_mask"] = [(lm[:max_len] + [0] * max(0, max_len - len(lm))) for lm in examples["loss_mask"]] - # return dictionary with sequences all exactly `max_len` long - return examples - - return _pad - - def process(self, raw_dataset, ds): - """ - Main processor entry. - - Args: - raw_dataset (datasets.DatasetDict): the dataset (e.g. returned by load_dataset) - ds (dataset): the dataset with get_target method. - - Returns: - datasets.DatasetDict: tokenized + optionally padded datasets (all splits preserved). - """ - if not hasattr(self.tokenizer, "pad_token") and hasattr(self.tokenizer, "bos_token"): - self.tokenizer.pad_token = self.tokenizer.bos_token - - # 1. 
tokenise - tokenized = raw_dataset.map( - lambda x: self._tokenize_function(x, dataset=ds), - batched=True, - num_proc=self.preprocessing_num_workers, - remove_columns=raw_dataset.column_names, - load_from_cache_file=not self.overwrite_cache, - desc="Running tokenizer on dataset", - ) - - # 2. pad (optional) - if self.pad_to_max_length: - # 2a. compute global max len - max_len = self._compute_dataset_max_len(tokenized) - - # 2b. pad to max len - pad_fn = self._pad_function(max_len) - tokenized = tokenized.map( - pad_fn, - batched=True, - num_proc=self.preprocessing_num_workers, - load_from_cache_file=not self.overwrite_cache, - desc=f"Padding dataset to max length {max_len}", - ) - - return tokenized - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/llm/chat_dataset.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations - -import json -import logging -import re -from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Sequence, Union - -from datasets import VerificationMode, load_dataset -from torch.utils.data import Dataset - -from nemo_automodel.components.datasets.llm.formatting_utils import ( - _add_pad_token, - _has_chat_template, - _resolve_chat_template, - format_chat_template, -) - - -def _is_hf_repo_id(val: str) -> bool: - # Basic check: org/name without local path existing - if "/" not in val: - return False - p = Path(val) - return not p.exists() and all(part for part in val.split("/")) - - -def _as_iter(val: Union[str, Sequence[str]]) -> Iterator[str]: - if isinstance(val, str): - yield val - else: - for x in val: - if not isinstance(x, str): - raise ValueError("data_files entries must be strings") - yield x - - -_SPLIT_SLICE_RE = re.compile(r"^(\w+)\[(\d*):(\d*)\]$") - - -def _parse_split_slice(split: Optional[str]): - """Parse a split string like ``"train[1024:]"`` into ``(base_split, slice | None)``.""" - if split is None: - return split, None - match = _SPLIT_SLICE_RE.match(split) - if not match: - return split, None - base = match.group(1) - start = int(match.group(2)) if match.group(2) else None - end = int(match.group(3)) if match.group(3) else None - return base, slice(start, end) - - -def _load_openai_messages( - path_or_dataset_id: Union[str, Sequence[str]], - split: Optional[str] = None, - name: Optional[str] = None, - shuffle_seed: Optional[int] = None, - skip_invalid_samples: bool = False, -): - """Load OpenAI chat messages datasets from HF or local JSON/JSONL files. - - For HF repo IDs, we delegate to datasets.load_dataset. When *split* - is provided, the full base split is loaded and shuffled *before* any - slice (e.g. ``[1024:]``) is applied so that train/val splits sample - from a consistent random order. 
When *split* is ``None`` it is passed - through to ``load_dataset`` as-is (no default override). - - For local files, we manually parse JSONL/JSON to avoid pyarrow type - inference issues (e.g., heterogeneous field types under `tools`). - - Args: - path_or_dataset_id: HF dataset ID or local file path(s). - split: Dataset split to load (e.g., "train", "train[1024:]"). - name: Dataset configuration/subset name - shuffle_seed: Random seed for shuffling HF datasets before slicing. - Set to ``None`` to disable shuffling. - skip_invalid_samples: If ``True``, skip malformed JSONL lines for local - files instead of failing fast. - """ - if isinstance(path_or_dataset_id, str) and _is_hf_repo_id(path_or_dataset_id): - base_split, sl = _parse_split_slice(split) - - dataset = load_dataset( - path_or_dataset_id, - name=name, - split=base_split, - streaming=False, - verification_mode=VerificationMode.NO_CHECKS, - ) - if shuffle_seed is not None: - dataset = dataset.shuffle(seed=shuffle_seed) - - if sl is not None: - indices = range(*sl.indices(len(dataset))) - dataset = dataset.select(indices) - - return dataset - - # Handle local directories and Parquet files via load_dataset. - # This covers pre-filtered cached datasets saved as Parquet. - if isinstance(path_or_dataset_id, str): - p = Path(path_or_dataset_id) - is_parquet_file = p.is_file() and p.suffix.lower() == ".parquet" - is_dataset_dir = p.is_dir() and any(p.glob("*.parquet")) - - if is_parquet_file or is_dataset_dir: - logging.getLogger(__name__).info("Loading local dataset from %s via load_dataset", path_or_dataset_id) - base_split, sl = _parse_split_slice(split) - - load_path = str(p.parent) if is_parquet_file else str(p) - # Cached Parquet datasets (from prefilter_dataset.py) are saved as a single - # split. Default to "train" when split is unspecified or was stripped to - # extract a slice (e.g. "train[:128]" → base_split="train", sl=slice(None,128)). 
- dataset = load_dataset( - load_path, - name=name, - split=base_split or "train", - data_files=p.name if is_parquet_file else None, - verification_mode=VerificationMode.NO_CHECKS, - ) - - if shuffle_seed is not None: - dataset = dataset.shuffle(seed=shuffle_seed) - if sl is not None: - indices = range(*sl.indices(len(dataset))) - dataset = dataset.select(indices) - return dataset - - # Fall back to manual JSON/JSONL parsing for local files. - files = list(_as_iter(path_or_dataset_id)) - if not files: - raise RuntimeError("No data files provided") - - rows: List[Dict[str, Any]] = [] - - def _read_file(fp: str) -> None: - p = Path(fp) - if not p.exists(): - raise FileNotFoundError(f"File not found: {fp}") - text = p.read_text(encoding="utf-8") - if p.suffix.lower() in {".jsonl", ".ndjson"}: - skipped_lines = 0 - for line in text.splitlines(): - line = line.strip() - if not line: - continue - try: - rows.append(json.loads(line)) - except json.JSONDecodeError: - if not skip_invalid_samples: - raise - skipped_lines += 1 - if skipped_lines: - logging.getLogger(__name__).warning( - "Skipped %d malformed JSONL line(s) from %s (skip_invalid_samples=True)", - skipped_lines, - fp, - ) - else: - obj = json.loads(text) - if isinstance(obj, list): - rows.extend(obj) - else: - rows.append(obj) - - for f in files: - _read_file(f) - - return rows - - -def _normalize_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """Ensure messages list is valid and content fields are strings for system/user/assistant. - - - Keeps tool_calling fields if present (e.g., tool calls in assistant messages, tool role messages). - - If content is a list of parts, only keep text parts. 
- """ - - def _normalize_content(value: Any) -> str: - if isinstance(value, list): - return " ".join(part["text"] for part in value if isinstance(part, dict) and "text" in part) - if value is None: - return "" - return str(value) - - def _normalize_tool_calls(tool_calls: Any) -> List[Dict[str, Any]]: - if not isinstance(tool_calls, list): - raise ValueError("assistant message `tool_calls` must be a list") - - normalized_tool_calls: List[Dict[str, Any]] = [] - for idx, tool_call in enumerate(tool_calls): - if not isinstance(tool_call, dict): - raise ValueError(f"assistant message `tool_calls[{idx}]` must be a dict") - - tool_call_id = tool_call.get("id") - if not isinstance(tool_call_id, str) or not tool_call_id: - raise ValueError(f"assistant message `tool_calls[{idx}].id` must be a non-empty string") - - tool_call_type = tool_call.get("type") - if not isinstance(tool_call_type, str) or not tool_call_type: - raise ValueError(f"assistant message `tool_calls[{idx}].type` must be a non-empty string") - - function = tool_call.get("function") - if not isinstance(function, dict): - raise ValueError(f"assistant message `tool_calls[{idx}].function` must be a dict") - - function_name = function.get("name") - if not isinstance(function_name, str) or not function_name: - raise ValueError(f"assistant message `tool_calls[{idx}].function.name` must be a non-empty string") - - function_arguments = function.get("arguments") - if function_arguments is None: - raise ValueError(f"assistant message `tool_calls[{idx}].function.arguments` is required") - - normalized_function = dict(function) - if not isinstance(function_arguments, str): - normalized_function["arguments"] = json.dumps(function_arguments) - - normalized_tool_call = dict(tool_call) - normalized_tool_call["function"] = normalized_function - normalized_tool_calls.append(normalized_tool_call) - - return normalized_tool_calls - - norm: List[Dict[str, Any]] = [] - for m in messages: - role = m.get("role") - out = dict(m) - if 
role not in {"system", "user", "assistant", "tool"}: - raise ValueError(f"Unsupported role in messages: {role}") - - out["content"] = _normalize_content(m.get("content")) - - if role == "assistant": - if "reasoning_content" in m: - reasoning_content = m.get("reasoning_content") - if reasoning_content is None: - out["reasoning_content"] = "" - else: - if not isinstance(reasoning_content, str): - raise ValueError("assistant message `reasoning_content` must be a string when provided") - out["reasoning_content"] = reasoning_content - if "tool_calls" in m: - out["tool_calls"] = _normalize_tool_calls(m.get("tool_calls")) - - if role == "tool": - tool_call_id = m.get("tool_call_id") - if not isinstance(tool_call_id, str) or not tool_call_id: - raise ValueError("tool message `tool_call_id` must be a non-empty string") - - norm.append(out) - return norm - - -class ChatDataset(Dataset): - """Dataset for OpenAI-format tool-calling chat transcripts. - - This class expects each row to contain a `messages` list in OpenAI chat format, - potentially including tool calls and tool responses. The datasetformats the - conversation via the tokenizer's chat template to produce `input_ids`, `labels`, - and `attention_mask` suitable for SFT. - """ - - def __init__( - self, - path_or_dataset_id: Union[str, Sequence[str]], - tokenizer, - *, - split: Optional[str] = None, - name: Optional[str] = None, - seq_length: Optional[int] = None, - padding: Union[str, bool] = "do_not_pad", - truncation: Union[str, bool] = "do_not_truncate", - start_of_turn_token: Optional[str] = None, - chat_template: Optional[str] = None, - shuffle_seed: Optional[int] = None, - mask_reasoning_content: bool = False, - unshifted: bool = False, - skip_invalid_samples: bool = False, - ) -> None: - """Load OpenAI-format chat rows and tokenize via the chat template. - - Args: - path_or_dataset_id: Hugging Face dataset id, local JSON/JSONL path(s), Parquet file, or Parquet directory. 
- tokenizer: Tokenizer with chat template support (required). - split: Dataset split or slice (e.g. ``train``, ``train[1024:]``). - name: Optional Hub subset / config name. - seq_length: Maximum sequence length for padding and truncation in formatting. - padding: Padding mode for ``format_chat_template``. - truncation: Truncation mode for ``format_chat_template``. - start_of_turn_token: Optional token marking assistant turns for answer-only loss. - chat_template: Optional Jinja template string overriding ``tokenizer.chat_template``. - shuffle_seed: If set, shuffles Hub/Parquet data before applying a split slice. - mask_reasoning_content: If ``True``, exclude rendered reasoning traces from the loss mask. - unshifted: Passed through to ``format_chat_template``. - skip_invalid_samples: If ``True``, skip malformed JSONL lines when reading local files (warning logs - include skip counts). If ``False``, a bad line raises. Does not skip invalid structured rows after - load; those still raise when a sample is accessed. 
- """ - if tokenizer is None: - raise ValueError("Tokenizer is required") - - # Enforce chat-template availability for tool-calling data - if chat_template is not None: - tokenizer.chat_template = _resolve_chat_template(chat_template) - - if not _has_chat_template(tokenizer): - raise ValueError("ChatDataset requires a tokenizer with chat template support.") - - self.tokenizer = tokenizer - self.seq_length = seq_length - self.padding = padding - self.truncation = truncation - self.start_of_turn_token = start_of_turn_token - self.mask_reasoning_content = mask_reasoning_content - self.unshifted = unshifted - self.skip_invalid_samples = skip_invalid_samples - - self.dataset = _load_openai_messages( - path_or_dataset_id, - split=split, - name=name, - shuffle_seed=shuffle_seed, - skip_invalid_samples=skip_invalid_samples, - ) - - # Ensure pad token presence for downstream padding - eos_token_id = getattr(self.tokenizer, "eos_token_id", 0) - self.pad_token_id = _add_pad_token(self.tokenizer) or eos_token_id - - def __len__(self) -> int: - return len(self.dataset) - - def __getitem__(self, idx: int) -> Dict[str, List[int]]: - row = self.dataset[idx] - messages = row.get("messages") - if not isinstance(messages, list): - raise ValueError("Each sample must contain a `messages` list in OpenAI format") - - normalized = _normalize_messages(messages) - tools = row.get("tools") - if tools is not None and not isinstance(tools, list): - tools = None - - eos_token_id = getattr(self.tokenizer, "eos_token_id", 0) - sample = format_chat_template( - self.tokenizer, - normalized, - eos_token_id, - self.pad_token_id, - seq_length=self.seq_length, - padding=self.padding, - truncation=self.truncation, - tools=tools, - mask_reasoning_content=self.mask_reasoning_content, - unshifted=self.unshifted, - ) - return sample - -``` - -File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import re -from enum import Enum -from pathlib import Path -from typing import Dict, Iterator, List, Optional, Union - -from datasets import VerificationMode, load_dataset -from torch.utils.data import Dataset - -from nemo_automodel.components.datasets.llm.formatting_utils import ( - _add_pad_token, - _has_chat_template, - format_chat_template, - format_prompt_completion, -) - -logger = logging.getLogger(__name__) - -# Supported cases: -# Format: -# - Context + question + answer -# - Question + answer -# Input types: -# - one or more paths to jsonl files -# - dataset id from huggingface. - - -class ColumnTypes(Enum): - Context = "context" - Question = "question" - Answer = "answer" - - -def make_iterable(val: Union[str, List[str]]) -> Iterator[str]: - """Utility that converts *val* into an iterator of strings. - - The helper accepts either a single string or a list of strings and - yields its contents. This is handy when we want to treat the two cases - uniformly downstream (e.g. when iterating over *data_files* that can be - provided as either a single path or a collection of paths). - - Args: - val: Either a single string or a list/tuple of strings. - - Yields: - str: The individual strings contained in *val*. - - Raises: - ValueError: If *val* is neither a string nor an iterable of strings. 
- """ - if isinstance(val, str): - yield val - elif isinstance(val, (list, tuple)): - for item in val: - if not isinstance(item, str): - raise ValueError("All elements must be strings") - yield item - else: - raise ValueError(f"Expected str or list[str], got {type(val)}") - - -def _str_is_hf_repo_id(val: str) -> bool: - """ - Check if a string is a valid huggingface dataset id. - - Args: - val: A string to check. - - Returns: - True if the string is a valid huggingface dataset id, False otherwise. - """ - return re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", val) is not None and not Path(val).exists() - - -def _load_dataset( - path_or_dataset_id: Union[str, List[str]], - split: Optional[str] = None, - streaming: bool = False, - name: Optional[str] = None, -): - """Load a dataset either from the Hugging Face Hub or from local JSON/JSONL files. - - If *path_or_dataset_id* resembles a HF repo ID (i.e. of the form - ``org/dataset`` and the path does **not** exist on the local filesystem), - we defer to ``datasets.load_dataset`` directly. Otherwise, we assume the - argument points to one or more local JSON/JSONL files and let - ``datasets.load_dataset`` with the *"json"* script handle the parsing. - - Args: - path_or_dataset_id: Either a HF dataset identifier (``org/name``) or - a path / list of paths to local ``.json`` / ``.jsonl`` files. - split: Optional split to load when retrieving a remote dataset. This - parameter is ignored for local files as the *json* script always - returns a single split. - streaming: Whether to stream the dataset. - name: Optional name of the dataset configuration/subset to load - - Returns: - datasets.Dataset: The loaded dataset. 
- """ - if isinstance(path_or_dataset_id, str) and _str_is_hf_repo_id(path_or_dataset_id): - return load_dataset( - path_or_dataset_id, - name=name, - split=split, - streaming=streaming, - verification_mode=VerificationMode.NO_CHECKS, - ) - - data_files = list(make_iterable(path_or_dataset_id)) - if not data_files: - raise RuntimeError("No data files provided") - - return load_dataset( - "json", data_files=data_files, split="train", streaming=streaming, verification_mode=VerificationMode.NO_CHECKS - ) - - -def _check_all_values_equal_length(sample: Dict[str, List[int]]) -> bool: - """ - Check if all values in the sample are of the same length. - """ - len0 = len(sample[next(iter(sample))]) - all_equal = True - for k, v in sample.items(): - if k == "___PAD_TOKEN_IDS___": - continue - if len(v) != len0: - all_equal = False - break - return all_equal - - -class ColumnMappedTextInstructionDataset(Dataset): - """Generic instruction-tuning dataset that maps arbitrary column names. - - The class is intentionally lightweight: it simply loads the raw samples - (either from HF or from local JSON/JSONL files) and remaps the columns so - that downstream components can rely on a consistent field interface. - - Optionally, if *answer_only_loss_mask* is requested, the dataset will also - compute a *loss_mask* indicating which tokens should contribute to the - loss (typically only those belonging to the assistant answer). - """ - - def __init__( - self, - path_or_dataset_id: Union[str, List[str]], - column_mapping: Dict[str, str], - tokenizer, - *, - split: Optional[str] = "train", - name: Optional[str] = None, - answer_only_loss_mask: bool = True, - seq_length: Optional[int] = None, - padding: Union[str, bool] = "do_not_pad", - truncation: Union[str, bool] = "do_not_truncate", - limit_dataset_samples: Optional[int] = None, - use_hf_chat_template: bool = False, - ) -> None: - """ - Initialize the dataset. - - Args: - path_or_dataset_id: The path or dataset id of the dataset. 
- column_mapping: The mapping of the columns. - tokenizer: The tokenizer to use. - split: The split of the dataset to load. - name: The name of the dataset configuration/subset to load - answer_only_loss_mask: Whether to compute the loss mask only on the answer tokens. - seq_length: The sequence length to use for padding. - limit_dataset_samples: The number of samples to load from the dataset. - """ - - if use_hf_chat_template and _has_chat_template(tokenizer): - if not answer_only_loss_mask: - logging.warning( - "answer_only_loss_mask=False but tokenizer has chat template. Consider providing `answer_only_loss_mask`." - ) - - assert tokenizer is not None, "Tokenizer is required" - self.tokenizer = tokenizer - if getattr(self.tokenizer, "pad_token", None) is None: - if hasattr(self.tokenizer, "eos_token"): - self.tokenizer.pad_token = self.tokenizer.eos_token - else: - logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.") - self.tokenizer.pad_token = " " - - self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name) - - if limit_dataset_samples is not None: - self.dataset = self.dataset.select(range(limit_dataset_samples)) - - # Keep mapping: dest -> source (i.e. 
public_field -> raw_column_name) - - assert isinstance(column_mapping, dict), "Expected column_mapping to be a dictionary" - # Ensure required columns are present - assert ColumnTypes.Answer.value in column_mapping, ("Expected answer to be in column_mapping", column_mapping) - if len(column_mapping) == 3: - assert ColumnTypes.Context.value in column_mapping, ( - "Expected context to be in column_mapping", - column_mapping, - ) - assert ColumnTypes.Question.value in column_mapping, ( - "Expected question to be in column_mapping", - column_mapping, - ) - elif len(column_mapping) == 2: - assert ColumnTypes.Context.value in column_mapping or ColumnTypes.Question.value in column_mapping, ( - "Expected context or question to be in column_mapping", - column_mapping, - ) - else: - raise ValueError(f"Expected 2 or 3 columns in column_mapping, got {len(column_mapping)}") - - self.column_mapping = column_mapping - - self.answer_only_loss_mask = answer_only_loss_mask - self.seq_length = seq_length - self.padding = padding - self.truncation = truncation - self.use_hf_chat_template = use_hf_chat_template - - def __iter__(self) -> Iterator[Dict[str, List[int]]]: - for idx in range(len(self)): - yield self[idx] - - def __len__(self) -> int: # noqa: D401 - """ - Returns the length of the dataset. - - Returns: - The length of the dataset. - """ - return len(self.dataset) - - def __getitem__(self, idx): # noqa: D401 - """ - Returns the item at the given index. - - Args: - idx: The index of the item to return. - - Returns: - A dictionary with the mapped columns. 
- """ - n = len(self.dataset) - for _ in range(n): - row = self.dataset[idx] - mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row} - mapped = self._apply_tokenizer(mapped) - if any(label != -100 for label in mapped["labels"]): - assert _check_all_values_equal_length(mapped), "All values must be of the same length" - return mapped - idx = (idx + 1) % n - raise ValueError( - "All samples in the dataset produced labels that are entirely -100. " - "Check that the dataset and tokenizer configuration produce valid training targets." - ) - - def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]: - """ - Tokenize a mapped *sample* and compute auxiliary fields. - - If the tokenizer is provided: - - If the tokenizer supports a chat template, the dataset will be tokenized in a conversation style. - - Otherwise, the dataset will be tokenized in a simple prompt-completion style. - - Args: - sample: A dictionary with the mapped columns. - - Returns: - A dictionary with the tokenized columns. 
- """ - assert isinstance(sample, dict), "Expected sample to be a dictionary" - assert len(sample) >= 2, "Expected at least two columns" - context = sample.get(ColumnTypes.Context.value, None) - question = sample.get(ColumnTypes.Question.value, None) - answer = sample[ColumnTypes.Answer.value] - - eos_token_id = getattr(self.tokenizer, "eos_token_id", 0) - pad_token_id = _add_pad_token(self.tokenizer) or eos_token_id - - if self.use_hf_chat_template and _has_chat_template(self.tokenizer): - formatted_text = [ - {"role": "system", "content": context or ""}, - {"role": "user", "content": question or ""}, - {"role": "assistant", "content": answer}, - ] - return format_chat_template( - self.tokenizer, - formatted_text, - eos_token_id, - pad_token_id, - seq_length=self.seq_length, - padding=self.padding, - truncation=self.truncation, - answer_only_loss_mask=self.answer_only_loss_mask, - ) - else: - prompt = " ".join(filter(lambda x: x is not None, (context, question, ""))) - assert len(prompt) > 1, "Expected prompt to be non-empty" - return format_prompt_completion( - self.tokenizer, - prompt, - answer, - eos_token_id, - pad_token_id, - seq_length=self.seq_length, - padding=self.padding, - truncation=self.truncation, - answer_only_loss_mask=self.answer_only_loss_mask, - ) - -``` - -File: /Users/mromeijn/src/Automodel/docs/guides/llm/finetune.md -```md -# Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT) - -## Introduction - -Pretrained language models are general-purpose: they know a lot about language but nothing about your particular domain, terminology, or task. Fine-tuning bridges that gap — you fine-tune the model on your own examples so it produces answers that are accurate and relevant for your use case, without the cost of training a model from scratch. The result is a model optimized for your data that you can evaluate, publish, and deploy. 
This guide walks you through that process end-to-end with NeMo AutoModel — from installation through training, evaluation, and deployment — using [Meta LLaMA 3.2 1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and the [SQuAD v1.1](https://huggingface.co/datasets/rajpurkar/squad) dataset as a running example. - -NeMo AutoModel supports two fine-tuning modes: - -- **Supervised Fine-Tuning (SFT)** updates all model parameters. Use SFT when you need maximum accuracy and have sufficient compute. -- **Parameter-Efficient Fine-Tuning (PEFT)** using [LoRA](https://arxiv.org/abs/2106.09685) freezes the base model and trains small low-rank adapters. PEFT reduces trainable parameters to less than 1% of the original model, lowering memory and storage costs. - -### Workflow Overview - -```text -┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ -│ 1. Install │--->│ 2. Configure │--->│ 3. Train │--->│ 4. Inference │--->│ 5. Evaluate │--->│ 6. Publish │--->│ 7. Deploy │ -│ │ │ │ │ │ │ │ │ │ │ (optional) │ │ (optional) │ -│ pip install │ │ YAML config │ │ automodel CLI│ │ HF generate │ │ Val loss + │ │ HF Hub │ │ vLLM serving │ -│ or Docker │ │ Choose SFT │ │ or torchrun │ │ API │ │ lm-eval- │ │ upload │ │ │ -│ │ │ or PEFT │ │ │ │ │ │ harness │ │ │ │ │ -└──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ -``` - -| Step | Section | SFT | PEFT | -|------|---------|-----|------| -| **1. Install** | [Install NeMo AutoModel](#install-nemo-automodel) | Same | Same | -| **2. Configure** | [Configure Your Training Recipe](#configure-your-training-recipe) | YAML without `peft:` section | YAML with `peft:` section | -| **3. Train** | [Fine-Tune the Model](#fine-tune-the-model) | Same command for both modes | Same command for both modes | -| **4. 
Inference** | [Run Inference](#run-inference) | Load consolidated checkpoint directly | Load base model + adapter | -| **5. Evaluate** | [Evaluate the Fine-Tuned Model](#evaluate-the-fine-tuned-model) | Validation loss during training; lm-eval-harness post-training | Same | -| **6. Publish** | [Publish to HF Hub](#publish-to-the-hugging-face-hub) | Upload `model/consolidated/` | Upload `model/` (adapter only) | -| **7. Deploy** | [Deploy with vLLM](#deploy-with-vllm) | `vllm.LLM(model=...)` | `vLLMHFExporter` with `--lora-model` | - -## Install NeMo AutoModel - -```bash -pip3 install nemo-automodel -``` - -Alternatively, if you run into dependency or driver issues, use the pre-built Docker container: - -```bash -docker pull nvcr.io/nvidia/nemo-automodel:26.02.00 -docker run --gpus all -it --rm --shm-size=8g -v $(pwd)/checkpoints:/tmp/checkpoints/ nvcr.io/nvidia/nemo-automodel:26.02.00 -``` - -:::{important} -Docker containers are ephemeral — files written inside the container are lost when it stops. The `-v` flag in the `docker run` command above bind-mounts a local `checkpoints/` directory into the container so that saved checkpoints persist across runs. For more details, see [Saving Checkpoints When Using Docker](../checkpointing.md#saving-checkpoints-when-using-docker). -::: - -For the full set of installation methods, see the [installation guide](../installation.md). - -## Configure Your Training Recipe - - -Training is configured through a [YAML](https://en.wikipedia.org/wiki/YAML) config file with three required sections — **model**, **dataset**, and **step_scheduler** — plus an optional **peft** section. The sections below walk through each one. For the complete copy-pastable file, see [Full Config YAML](#full-config-yaml). - -Under the hood, both SFT and PEFT are executed by a **recipe**: a self-contained Python class that wires together model loading, dataset preparation, training, checkpointing, and logging. 
The fine-tuning recipe is [`TrainFinetuneRecipeForNextTokenPrediction`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). The config file tells the recipe *what* to build; the recipe decides *how* to build it. - -:::{dropdown} How the Config System Works - -NeMo AutoModel configs use a convention borrowed from [Hydra](https://hydra.cc/): the special **`_target_`** key tells the framework *which* Python class or function to call, and **every other key** in the same YAML block is passed as a keyword argument to that call. For example: - -```yaml -optimizer: - _target_: torch.optim.Adam - lr: 1.0e-5 - weight_decay: 0 -``` - -is equivalent to writing this Python code: - -```python -from torch.optim import Adam - -optimizer = Adam(lr=1.0e-5, weight_decay=0) -``` - -The `_target_` value is a **dotted Python import path**: the same string you would use in an `import` statement. The framework resolves it at runtime by importing the module and looking up the attribute. This means you can point `_target_` at any class constructor or factory function, and the remaining keys become its arguments. - -:::{tip} -To discover which parameters a section accepts, look up the Python signature of its `_target_`. For instance, `torch.optim.Adam` accepts `lr`, `betas`, `eps`, and `weight_decay` — those are the keys you can set in the YAML. -::: - -**From YAML to running code.** Here is the path a config takes through the framework: - -```text -finetune_config.yaml - │ - ▼ - ┌──────────────┐ load_yaml_config() parses the file into - │ ConfigNode │◄─── a tree of ConfigNode objects, one per - └──────┬───────┘ YAML section. - │ - ▼ - ┌──────────────┐ The recipe's setup() method reads - │ Recipe │◄─── each section from the ConfigNode tree - │ setup() │ and passes it to the matching builder. - └──────┬───────┘ - │ - ┌────┴─────────────────────────────────┐ - ▼ ▼ ▼ ▼ -build_model build_optimizer build_dataloader build_loss_fn ... 
- │ │ │ │ - ▼ ▼ ▼ ▼ -cfg.model cfg.optimizer cfg.dataset cfg.loss_fn - .instantiate() .instantiate() .instantiate() .instantiate() - │ │ │ │ - ▼ ▼ ▼ ▼ - Resolves Resolves Resolves Resolves - _target_, _target_, _target_, _target_, - calls it calls it calls it calls it - with kwargs with kwargs with kwargs with kwargs -``` - -Each builder function calls **`.instantiate()`** on its config section. `.instantiate()` does two things: - -1. **Resolves `_target_`** — imports the Python path and obtains the callable (class or function). -2. **Calls it** — passes every other key in the section as a keyword argument. - -Nested `_target_` blocks (like `collate_fn` inside `dataloader`) are recursively instantiated the same way. - -**The `recipe` key.** Every config file includes a top-level `recipe` key that tells the CLI *which recipe class* to run. You can write it as a **short name** or as a **fully-qualified Python path** — both resolve to the same class: - -```yaml -# Short name (the CLI looks up the class automatically) -recipe: TrainFinetuneRecipeForNextTokenPrediction - -# Fully-qualified path (used as-is) -recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction -``` - -The short name form is a convenience — the CLI scans all recipe modules under `nemo_automodel.recipes` and matches the bare class name. If you invoke the recipe script directly with `torchrun` instead of the `automodel` CLI, the `recipe` key is not required because the script itself *is* the recipe. - -**Not every section uses `_target_`.** Some sections like `step_scheduler`, `distributed`, and `checkpoint` are plain key-value groups consumed directly by the recipe — they control training schedule, parallelism strategy, and checkpoint behavior without instantiating a Python object. 
-::: - -### Model - -```yaml -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B -``` - -| Key | Role | -|-----|------| -| `_target_` | Points to [`NeMoAutoModelForCausalLM.from_pretrained`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py) — a factory method that downloads (or loads from cache) a pretrained Hugging Face model and wraps it with NeMo distributed-training support. | -| `pretrained_model_name_or_path` | A keyword argument to `from_pretrained`. Any argument that [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained) accepts can be added here (e.g. `cache_dir`, `torch_dtype`). | - -This guide uses **Meta Llama 3.2 1B** as a running example. Replace `pretrained_model_name_or_path` with any supported [Hugging Face model ID](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/model-coverage/llm.md). - -:::{dropdown} About Llama 3.2 1B -Llama is a family of decoder-only transformer models developed by Meta. The 1B variant is a compact model suitable for research and edge deployment, featuring RoPE positional embeddings, grouped-query attention (GQA), and SwiGLU activations. -::: - -:::{dropdown} Accessing gated models -Some Hugging Face models are **gated**. If the model page shows a "Request access" button: - -1. Log in with your Hugging Face account and accept the license. -2. Ensure the token you use (from `huggingface-cli login` or `HF_TOKEN`) belongs to the approved account. - -Pulling a gated model without an authorized token triggers a 403 error. 
-::: - -### Dataset - -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad # HF-Hub ID used to pull the dataset - split: train - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation -``` - -| Key | Role | -|-----|------| -| `_target_` | Points to [`make_squad_dataset`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) — a factory function that downloads the SQuAD dataset, tokenizes it, and returns a `torch.utils.data.Dataset`. To use a different dataset, change `_target_` to a different factory function (see [Integrate Your Own Text Dataset](dataset.md)). | -| `dataset_name`, `split` | Keyword arguments passed to `make_squad_dataset`. Each dataset factory defines its own parameters — check the function signature to see what's available. | - -This guide uses **SQuAD v1.1** as a running example. Swap the dataset by changing `_target_` and the dataset arguments — see [Integrate Your Own Text Dataset](dataset.md) and [Dataset Overview](../dataset-overview.md). - -:::{dropdown} About SQuAD v1.1 -The Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset where each example consists of a Wikipedia passage, a question, and a span answer. SQuAD v1.1 guarantees all questions are answerable from the context, making it suitable for straightforward fine-tuning. - -Example: -```json -{ - "context": "Architecturally, the school has a Catholic character. 
...", - "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?", - "answers": { "text": ["Saint Bernadette Soubirous"], "answer_start": [515] } -} -``` -::: - -### PEFT (Optional) - -```yaml -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - target_modules: "*.proj" # glob pattern matching linear layer FQNs - dim: 8 # low-rank dimension of the adapters - alpha: 32 # scaling factor for learned weights -``` - -| Key | Role | -|-----|------| -| `_target_` | Points to [`PeftConfig`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py) — a dataclass that describes which layers to adapt and how. Unlike the model and dataset sections, this instantiation produces a *config object*, not the adapter itself. The recipe passes the resulting `PeftConfig` into `build_model`, which applies LoRA adapters to the model. | -| `target_modules` | A glob pattern matched against fully-qualified layer names (e.g. `"*.proj"` matches every layer whose name ends in `proj`). | -| `dim` | The low-rank dimension *r* — controls adapter capacity. Larger values learn more but use more memory. | -| `alpha` | Scaling factor applied to the adapter output (`alpha / dim`). Higher values give adapters more influence during training. | - -Including a `peft:` section enables LoRA fine-tuning. Remove it entirely to run SFT instead — see [Switching Between SFT and PEFT](#switching-between-sft-and-peft). - -#### QLoRA (Quantized Low-Rank Adaptation) - -If GPU memory is a constraint, [QLoRA](https://arxiv.org/abs/2305.14314) combines LoRA with 4-bit NormalFloat (NF4) quantization to reduce memory usage by up to 75% compared to full-parameter SFT in 16-bit precision, while maintaining comparable quality to standard LoRA. - -To enable QLoRA, add a `quantization:` section alongside the `peft:` section in your config. 
Note two differences from the standard PEFT config above: `target_modules` uses the broader `"*_proj"` pattern to apply LoRA to all projection layers (wider coverage compensates for precision loss from 4-bit weights), and `dim` is increased from 8 to 16 for additional adapter capacity. - -```yaml -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - target_modules: "*_proj" # broader glob than "*.proj" to cover all projection layers - dim: 16 # LoRA rank (higher than default to offset quantization) - alpha: 32 # scaling factor - dropout: 0.1 # LoRA dropout rate - -quantization: - load_in_4bit: True # enable 4-bit quantization - load_in_8bit: False # use 4-bit, not 8-bit - bnb_4bit_compute_dtype: bfloat16 # compute dtype - bnb_4bit_use_double_quant: True # double quantization for extra savings - bnb_4bit_quant_type: nf4 # NormalFloat quantization type - bnb_4bit_quant_storage: bfloat16 # storage dtype for quantized weights -``` - -### Training Schedule - -```yaml -step_scheduler: - num_epochs: 1 # Will train over the dataset once. -``` - -Unlike the sections above, `step_scheduler` has **no `_target_`** — it is not instantiated into a Python object. Instead, the recipe reads its keys directly to control the training loop (how many epochs to run, when to checkpoint, when to validate). This is typical of sections that configure *behavior* rather than *components*. - -All other settings (distributed strategy, optimizer, checkpointing, logging) use sensible defaults. See the [Full Configuration Reference](#full-configuration-reference) to customize them. - -### Full Config YAML - -:::{dropdown} finetune_config.yaml (click to expand) -Save as `finetune_config.yaml`. This config runs PEFT (LoRA). To run SFT instead, remove the `peft:` section. 
For production-ready examples, see the hosted configs: [Llama 3.2 1B SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) and [Llama 3.2 1B PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml). - -```yaml -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - target_modules: "*.proj" - dim: 8 - alpha: 32 - -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: train - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - -step_scheduler: - num_epochs: 1 -``` -::: - -## Fine-Tune the Model - -You can run the recipe using the AutoModel CLI or directly with `torchrun` (advanced). - -```bash -automodel --nproc-per-node=8 finetune_config.yaml -``` - -The `--nproc-per-node=8` flag specifies the number of GPUs per node. Adjust to your case (for a single GPU, omit the `--nproc-per-node` option). - -### Invoke the Recipe Script Directly (advanced) - -Alternatively, you can invoke the recipe [script](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py) directly using [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html), as shown below. 
- -``` bash -torchrun --nproc-per-node=8 nemo_automodel/recipes/llm/train_ft.py -c finetune_config.yaml -``` - -### Sample Output -Running the recipe using either the `automodel` app or by directly invoking the recipe script should produce -the following log: -``` -$ automodel finetune_config.yaml -INFO:nemo_automodel.cli.app:Config: finetune_config.yaml -INFO:nemo_automodel.cli.app:Recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction -INFO:nemo_automodel.cli.app:Launching job interactively (local) -cfg-path: finetune_config.yaml -INFO:root:step 4 | epoch 0 | loss 1.5514 | grad_norm 102.0000 | mem: 11.66 GiB | tps 6924.50 -INFO:root:step 8 | epoch 0 | loss 0.7913 | grad_norm 46.2500 | mem: 14.58 GiB | tps 9328.79 -Saving checkpoint to checkpoints/epoch_0_step_10 -INFO:root:step 12 | epoch 0 | loss 0.4358 | grad_norm 23.8750 | mem: 15.48 GiB | tps 9068.99 -INFO:root:step 16 | epoch 0 | loss 0.2057 | grad_norm 12.9375 | mem: 16.47 GiB | tps 9148.28 -INFO:root:step 20 | epoch 0 | loss 0.2557 | grad_norm 13.4375 | mem: 12.35 GiB | tps 9196.97 -Saving checkpoint to checkpoints/epoch_0_step_20 -INFO:root:[val] step 20 | epoch 0 | loss 0.2469 -``` - -Each log line reports the current loss, gradient norm, peak GPU memory, and tokens per second (TPS). Small fluctuations between steps (e.g., 0.2057 to 0.2557 above) are normal — look at the overall downward trend rather than individual values. - -### Checkpoint Contents - -Checkpoints are saved in native Hugging Face format, so no conversion is required — they work directly with Transformers, PEFT, vLLM, lm-eval-harness, and other tools in the Hugging Face ecosystem. SFT and PEFT produce different checkpoint layouts. **SFT checkpoints** contain the full model weights at `model/consolidated/` — a single, self-contained Hugging Face model directory created by gathering distributed shards into one location — and can be loaded directly. 
**PEFT checkpoints** contain only the adapter weights (~MBs instead of GBs) — at inference time you must load the original base model and apply the adapter on top. This distinction affects every downstream step (inference, publishing, deployment). - -:::{dropdown} Checkpoint directory structure -**SFT checkpoint:** -```bash -$ tree checkpoints/epoch_0_step_10/ -checkpoints/epoch_0_step_10/ -├── config.yaml -├── dataloader.pt -├── model -│ ├── consolidated -│ │ ├── config.json -│ │ ├── model-00001-of-00001.safetensors -│ │ ├── model.safetensors.index.json -│ │ ├── special_tokens_map.json -│ │ ├── tokenizer.json -│ │ ├── tokenizer_config.json -│ │ └── generation_config.json -│ ├── shard-00001-model-00001-of-00001.safetensors -│ └── shard-00002-model-00001-of-00001.safetensors -├── optim -│ ├── __0_0.distcp -│ └── __1_0.distcp -├── rng.pt -└── step_scheduler.pt - -4 directories, 11 files -``` - -**PEFT checkpoint:** -```bash -$ tree checkpoints/epoch_0_step_10/ -checkpoints/epoch_0_step_10/ -├── dataloader.pt -├── config.yaml -├── model -│ ├── adapter_config.json -│ ├── adapter_model.safetensors -│ └── automodel_peft_config.json -├── optim -│ ├── __0_0.distcp -│ └── __1_0.distcp -├── rng.pt -└── step_scheduler.pt - -2 directories, 8 files -``` -::: - -## Run Inference - -Inference uses the Hugging Face `generate` API. Because SFT checkpoints are self-contained while PEFT checkpoints store only adapter weights (see [Checkpoint Contents](#checkpoint-contents)), the loading procedure differs between the two modes. 
- -### SFT Inference - -The SFT checkpoint at `model/consolidated/` is a complete Hugging Face model and can be loaded directly: - -```python -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer - -ckpt_path = "checkpoints/epoch_0_step_10/model/consolidated" -tokenizer = AutoTokenizer.from_pretrained(ckpt_path) -model = AutoModelForCausalLM.from_pretrained(ckpt_path) - -device = "cuda" if torch.cuda.is_available() else "cpu" -model.to(device) - -prompt = ( - "Context: Architecturally, the school has a Catholic character. " - "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. " - "Immediately in front of the Main Building and facing it, is a copper statue of Christ " - "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n" - "Question: What is atop the Main Building?\n\n" - "Answer:" -) -inputs = tokenizer(prompt, return_tensors="pt").to(device) -output = model.generate(**inputs, max_new_tokens=50) -print(tokenizer.decode(output[0], skip_special_tokens=True)) -``` - -### PEFT Inference - -PEFT adapters must be loaded on top of the base model: - -```python -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer -from peft import PeftModel - -base_model_name = "meta-llama/Llama-3.2-1B" -tokenizer = AutoTokenizer.from_pretrained(base_model_name) -model = AutoModelForCausalLM.from_pretrained(base_model_name) - -adapter_path = "checkpoints/epoch_0_step_10/model/" -model = PeftModel.from_pretrained(model, adapter_path) - -device = "cuda" if torch.cuda.is_available() else "cpu" -model.to(device) - -prompt = ( - "Context: Architecturally, the school has a Catholic character. " - "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. 
" - "Immediately in front of the Main Building and facing it, is a copper statue of Christ " - "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n" - "Question: What is atop the Main Building?\n\n" - "Answer:" -) -inputs = tokenizer(prompt, return_tensors="pt").to(device) -output = model.generate(**inputs, max_new_tokens=50) -print(tokenizer.decode(output[0], skip_special_tokens=True)) -``` - -## Evaluate the Fine-Tuned Model - -### During Training: Validation Loss - -The recipe automatically computes validation loss at the interval set by `val_every_steps`. Look for `[val]` lines in the training log: - -```text -INFO:root:[val] step 20 | epoch 0 | loss 0.2469 -``` - -A decreasing validation loss across checkpoints indicates the model is learning. If validation loss plateaus or increases while training loss continues to drop, the model may be overfitting — consider stopping earlier or reducing the learning rate. - -### After Training: lm-eval-harness - -For task-specific benchmarks (e.g., MMLU, GSM8k, HellaSwag accuracy), use [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) with the fine-tuned checkpoint: - -```bash -pip install lm-eval - -# SFT checkpoint (using vLLM backend for faster evaluation) -lm_eval --model vllm \ - --model_args pretrained=checkpoints/epoch_0_step_20/model/consolidated/ \ - --tasks hellaswag \ - --batch_size auto - -# PEFT adapter (using Hugging Face backend with built-in PEFT support) -lm_eval --model hf \ - --model_args pretrained=meta-llama/Llama-3.2-1B,peft=checkpoints/epoch_0_step_20/model/ \ - --tasks hellaswag \ - --batch_size auto -``` - -:::{tip} -The SFT example uses the `vllm` backend for faster evaluation (requires `pip install vllm`; see [Deploy with vLLM](#deploy-with-vllm) for setup details). The PEFT example uses the `hf` backend with lm-eval's built-in PEFT support to load the adapter on top of the base model. 
-::: - -:::{tip} -Run lm-eval-harness on the base model *before* fine-tuning to establish a baseline, then compare against the fine-tuned checkpoint. -::: - -## Publish to the Hugging Face Hub - -Fine-tuned checkpoints and PEFT adapters are stored in Hugging Face-native format and can be uploaded directly to the Hub. - -1. Install the Hugging Face Hub library (if not already installed): - -```bash -pip3 install huggingface_hub -``` - -2. Log in to Hugging Face: - -```bash -huggingface-cli login -``` - -3. Upload: - -**SFT checkpoint:** -```python -from huggingface_hub import HfApi - -api = HfApi() -api.upload_folder( - folder_path="checkpoints/epoch_0_step_10/model/consolidated", - repo_id="your-username/llama3.2_1b-finetuned-squad", - repo_type="model", -) -``` - -**PEFT adapter:** -```python -from huggingface_hub import HfApi - -api = HfApi() -api.upload_folder( - folder_path="checkpoints/epoch_0_step_10/model", - repo_id="your-username/llama3.2_1b-lora-squad", - repo_type="model", -) -``` - -Once uploaded, load the checkpoint or adapter directly from the Hub: - -**SFT:** -```python -from transformers import AutoModelForCausalLM - -model = AutoModelForCausalLM.from_pretrained("your-username/llama3.2_1b-finetuned-squad") -``` - -**PEFT:** -```python -from transformers import AutoModelForCausalLM -from peft import PeftModel - -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") -model = PeftModel.from_pretrained(model, "your-username/llama3.2_1b-lora-squad") -``` - -## Deploy with vLLM - -[vLLM](https://github.com/vllm-project/vllm) is an efficient inference engine for production deployment of LLMs. - -:::{note} -Make sure vLLM is installed (`pip install vllm`, or use an environment that includes it). 
-::: - -### SFT Checkpoint with vLLM - -```python -from vllm import LLM, SamplingParams - -llm = LLM(model="checkpoints/epoch_0_step_10/model/consolidated/", model_impl="transformers") -params = SamplingParams(max_tokens=20) -outputs = llm.generate("Toronto is a city in Canada.", sampling_params=params) -print(f"Generated text: {outputs[0].outputs[0].text}") -``` -```text ->>> Generated text: It is the capital of Ontario. Toronto is a global hub for cultural tourism. The City of Toronto -``` - -### PEFT Adapter with vLLM - -PEFT adapter serving uses the `vLLMHFExporter` class, which is provided by the `nemo` package — a separate dependency from `nemo-automodel`. - -:::{important} -Install both packages before proceeding: -```bash -pip install nemo vllm -``` -::: - -```python -from nemo.export.vllm_hf_exporter import vLLMHFExporter - -if __name__ == '__main__': - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument('--model', required=True, type=str, help="Local path of the base model") - parser.add_argument('--lora-model', required=True, type=str, help="Local path of the LoRA adapter") - args = parser.parse_args() - - lora_model_name = "lora_model" - - exporter = vLLMHFExporter() - exporter.export(model=args.model, enable_lora=True) - exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora_model) - - print("vLLM Output: ", exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name)) -``` - -## Full Configuration Reference - -This section documents all available config fields for the fine-tuning recipe. For the quick-start config, see [Configure Your Training Recipe](#configure-your-training-recipe). - -### Switching Between SFT and PEFT - -The `peft:` section controls which mode runs: - -| Mode | What to do in the YAML | -|------|----------------------| -| **PEFT (LoRA)** | Include the `peft:` section as shown below. | -| **SFT (full-parameter)** | Remove/comment the `peft:` section entirely. 
| - -All other config sections remain the same for both modes. - -### Full Configuration - -:::{dropdown} Full Config -:open: -```yaml -# Recipe -# Selects which recipe class runs the training loop. -# Use a short name (auto-discovered) or a fully-qualified Python path: -# recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction -recipe: TrainFinetuneRecipeForNextTokenPrediction - -# Training Schedule -# Controls epoch count, batch sizes, and how often to checkpoint / validate. -# No _target_ — these are plain values read directly by the recipe. -step_scheduler: - grad_acc_steps: 4 # number of micro-batches accumulated before each optimizer - # step. Effective batch = grad_acc_steps × batch_size. - ckpt_every_steps: 10 # save a checkpoint every N gradient steps - val_every_steps: 10 # run the validation loop every N gradient steps - num_epochs: 1 # how many full passes over the training dataset - -# Process Group -# Initializes the PyTorch distributed process group. -# No _target_ — consumed directly by the recipe. -# You normally would not need to tune this. -dist_env: - backend: nccl # communication backend: "nccl" (GPU, recommended) or "gloo" (CPU) - timeout_minutes: 1 # timeout for collective operations; increase for large models - # that take longer to initialize - -# Distributed Strategy -# Determines how model weights, data, and compute are split across GPUs. -# No _target_ — consumed directly by the recipe. -# See "Distributed Training: TP, PP, CP, and EP" in Advanced Topics for details. -distributed: - strategy: fsdp2 # parallelism strategy: "fsdp2" (recommended), "megatron_fsdp", - # or "ddp". FSDP2 shards parameters and optimizer states across - # the data-parallel group. - dp_size: null # data-parallel group size. null = auto-detect from - # world_size ÷ (tp_size × cp_size × pp_size). - tp_size: 1 # tensor-parallel size: splits weight matrices across GPUs. - # Set to 2, 4, or 8 if the model doesn't fit on one GPU. 
- # Should divide evenly into the number of attention heads. - cp_size: 1 # context-parallel size: splits the input sequence across GPUs. - # Increase for very long contexts (e.g. 32k+ tokens). - sequence_parallel: false # when true, extends TP to also shard activations along - # the sequence dimension for additional memory savings - -# Random Number Generator -# _target_ → StatefulRNG: a checkpointable RNG that ensures identical sequences -# across training restarts. Seed and ranked are kwargs to StatefulRNG(). -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1111 # global random seed for reproducibility - ranked: true # when true, each GPU rank gets a unique RNG stream derived - # from the seed, so data shuffling differs per GPU - -# Model -# _target_ → NeMoAutoModelForCausalLM.from_pretrained: downloads (or loads from -# cache) a pretrained HuggingFace model and wraps it with NeMo distributed-training -# support. Any from_pretrained kwarg is accepted (cache_dir, torch_dtype, etc.). -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -# PEFT (remove / comment this entire section for full-parameter SFT) -# _target_ → PeftConfig: a dataclass describing which layers get LoRA adapters. -# The recipe passes this config into build_model(), which attaches adapters -# to the matching layers. -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - target_modules: "*.proj" # glob pattern matched against fully-qualified layer names; - # "*.proj" matches every layer ending in "proj" - dim: 8 # low-rank dimension r — controls adapter capacity. - # Larger values are more expressive but use more memory. - alpha: 32 # LoRA scaling factor: adapter output is scaled by alpha/dim. - # Higher values give adapters more influence during training. 
- use_triton: True # use an optimized Triton kernel for LoRA forward/backward - # (requires the triton package) - -# Checkpointing -# No _target_ — plain key-value group consumed by the recipe. -checkpoint: - enabled: true # set to false to skip saving checkpoints entirely - checkpoint_dir: checkpoints/ # output directory. Docker users: bind-mount this path - # (e.g. -v $(pwd)/checkpoints:/workspace/checkpoints) - # to persist checkpoints across container restarts. - model_save_format: safetensors # "safetensors" (recommended, faster and safer) or - # "torch_save" (legacy pickle-based format) - save_consolidated: True # when true, writes a single HuggingFace-compatible checkpoint - # to model/consolidated/ that can be loaded directly by - # Transformers, vLLM, etc. Requires safetensors format. - -# Training Dataset -# _target_ → make_squad_dataset: a factory function that downloads the SQuAD -# dataset, tokenizes it, and returns a torch Dataset. To use a different dataset, -# change _target_ to another factory function (see the dataset guide). -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad # HuggingFace Hub dataset ID - split: train # which split to use (train, validation, test) - -# Validation Dataset -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - limit_dataset_samples: 64 # cap validation set to 64 samples for faster eval loops; - # remove this line to use the full validation set - -# Training Dataloader -# _target_ → StatefulDataLoader: a checkpointable DataLoader from torchdata that -# saves and restores iteration state across training restarts, so resumed runs -# don't re-process already-seen batches. 
-dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - # function that pads and batches individual samples - # into tensors; can be swapped for custom collation - batch_size: 8 # samples per micro-batch per GPU - shuffle: true # whether to shuffle the dataset each epoch - -# Validation Dataloader -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - batch_size: 8 - -# Loss Function -# _target_ → MaskedCrossEntropy: standard cross-entropy loss that automatically -# ignores padding tokens so they don't affect the gradient. -# Other available loss functions (swap _target_ to use): -# - nemo_automodel.components.loss.chunked_ce.ChunkedCrossEntropy -# Computes CE in chunks along the sequence dimension to reduce peak memory. -# Useful for very long sequences. Accepts chunk_len (default 32). -# - nemo_automodel.components.loss.linear_ce.FusedLinearCrossEntropy -# Fuses the final linear projection (lm_head) with the CE computation, -# avoiding the full logit tensor. Significant **memory savings** for large vocabs. -# - nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy -# TE-based parallel CE with a Triton kernel. Designed for tensor-parallel -# setups where logits are sharded across TP ranks. -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -# Optimizer -# _target_ → torch.optim.Adam: any torch.optim class can be used here (e.g. -# AdamW, SGD). All remaining keys become kwargs to the constructor. 
-optimizer: - _target_: torch.optim.Adam - lr: 1.0e-5 # learning rate — the most important hyperparameter to tune - betas: [0.9, 0.999] # Adam momentum coefficients (β₁ for mean, β₂ for variance) - eps: 1e-8 # small constant added to the denominator for numerical stability - weight_decay: 0 # L2 regularization strength (0 = no regularization) - -# Logging (optional) -# Uncomment to enable Weights & Biases experiment tracking. -# wandb: -# project: # W&B project name -# entity: # W&B team or username -# name: # display name for this run -# save_dir: # local directory for W&B artifacts -``` -::: - -### Config Field Reference - -| Section | Required? | What to change | -|---------|-----------|----------------| -| `model` | Yes | Set `pretrained_model_name_or_path` to your Hugging Face model ID. Source: [`auto_model.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py). | -| `peft` | PEFT only | Remove entirely for SFT. Adjust `dim` and `alpha` to tune adapter capacity. `use_triton: True` enables an optimized LoRA kernel (requires the `triton` package). For reduced memory usage, see [QLoRA](#qlora-quantized-low-rank-adaptation). Source: [`lora.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py). | -| `dataset` | Yes | Change `_target_`, `dataset_name`, and `split` for your data. Source: [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py). | -| `dataloader` | Optional | Adjust `batch_size` and `shuffle`. Uses [`StatefulDataLoader`](https://meta-pytorch.org/data/main/torchdata.stateful_dataloader.html) for checkpointable iteration. Collation: [`utils.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/utils.py). | -| `loss_fn` | Optional | Default is [`MaskedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/masked_ce.py). 
Alternatives: [`ChunkedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/chunked_ce.py) (long sequences), [`FusedLinearCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/linear_ce.py) (large vocabs), [`TEParallelCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/te_parallel_ce.py) (tensor-parallel). | -| `rng` | Optional | Controls reproducibility. Source: [`rng.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/training/rng.py). | -| `step_scheduler` | Yes | `grad_acc_steps` sets how many micro-batches accumulate per gradient step. `ckpt_every_steps` and `val_every_steps` are counted in gradient steps. | -| `distributed` | Yes | `dp_size: null` means auto-detect from world size. Adjust `tp_size` for tensor parallelism across GPUs. | -| `checkpoint` | Recommended | Set `checkpoint_dir` to a persistent path, especially in Docker. | -| `optimizer` | Optional | Defaults are reasonable. Any `torch.optim` class can be substituted via `_target_`. | -| `wandb` | Optional | Uncomment and configure to enable Weights & Biases logging. | - -For the fine-tuning recipe itself, see [`train_ft.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). For more example configs, browse [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune). - -## Distributed Training: TP, PP, CP, and EP - -The `distributed:` section controls how the model and data are split across GPUs. NeMo AutoModel supports four parallelism dimensions, each of which slices the workload differently: - -| Dimension | Key | What it shards | When to use | -|-----------|-----|---------------|-------------| -| **Data Parallel (DP)** | `dp_size` | Replicates the model on each group of GPUs; each replica trains on a different data batch. | Default. 
Scales batch size linearly with GPU count. | -| **Tensor Parallel (TP)** | `tp_size` | Splits individual weight matrices (attention, MLP) across GPUs within a node. | Model is too large to fit on a single GPU, or you want to reduce per-GPU memory at the cost of extra communication. | -| **Pipeline Parallel (PP)** | `pp_size` | Assigns different *layers* (stages) to different GPUs and pipelines micro-batches through them. | Very deep models that don't fit even with TP, or multi-node training where TP's all-reduce is too expensive across nodes. | -| **Context Parallel (CP)** | `cp_size` | Splits the input *sequence* across GPUs so each GPU processes a portion of the context. | Very long sequences that exceed single-GPU memory. | -| **Expert Parallel (EP)** | `ep_size` | Distributes MoE experts across GPUs so each GPU holds a subset of experts. | Mixture-of-Experts models only. | - -These dimensions compose with each other. The relationship between them and total GPU count is: - -```text -world_size = pp_size × dp_size × cp_size × tp_size -``` - -When `dp_size` is set to `null` (the default), it is inferred automatically: - -```text -dp_size = world_size ÷ (tp_size × cp_size × pp_size) -``` - -EP does not appear in this formula — experts are distributed across the DP×CP rank groups, with the constraint that `(dp_size × cp_size)` must be divisible by `ep_size`. - -#### Data Parallel (default) - -Data parallelism is the default. With `strategy: fsdp2`, FSDP2 shards both model parameters and optimizer states across the DP group, so memory usage shrinks as you add GPUs: - -```yaml -distributed: - strategy: fsdp2 - dp_size: null # auto-detected from world_size ÷ (tp × cp × pp) - tp_size: 1 - cp_size: 1 -``` - -#### Tensor Parallelism - -TP splits weight matrices across GPUs within a single node. 
Set `tp_size` to the number of GPUs you want to shard over (typically 2, 4, or 8 — should divide evenly into the number of attention heads): - -```yaml -distributed: - strategy: fsdp2 - dp_size: null - tp_size: 4 - cp_size: 1 - sequence_parallel: false # set to true for additional memory savings -``` - -`sequence_parallel: true` extends TP to also shard activation memory along the sequence dimension, further reducing per-GPU memory at the cost of additional communication. - -#### Pipeline Parallelism - -PP assigns groups of layers to different GPUs and streams micro-batches through the stages. It requires an additional nested `pipeline:` section: - -```yaml -distributed: - strategy: fsdp2 - dp_size: null - tp_size: 4 - pp_size: 4 - cp_size: 1 - activation_checkpointing: true - - pipeline: - pp_schedule: interleaved1f1b # pipeline schedule (1f1b or interleaved1f1b) - pp_microbatch_size: 1 # micro-batch size per pipeline step - layers_per_stage: 4 # how many layers each stage handles - scale_grads_in_schedule: false -``` - -| Key | Role | -|-----|------| -| `pp_schedule` | The micro-batch schedule. `1f1b` is simpler; `interleaved1f1b` overlaps compute and communication for better throughput. | -| `pp_microbatch_size` | Number of samples per micro-batch fed into the pipeline. Must satisfy: `local_batch_size ÷ pp_microbatch_size ≥ pp_size`. | -| `layers_per_stage` | How many transformer layers each pipeline stage contains. If omitted, the framework splits layers evenly across `pp_size` stages. | - -:::{note} -PP requires the model to define a `_pp_plan` that tells the framework how to split layers into stages. All built-in models include this plan; custom models must add one. -::: - -#### Context Parallelism - -CP splits the sequence across GPUs — useful for very long contexts that exceed single-GPU memory. 
Set `cp_size` to the desired split factor: - -```yaml -distributed: - strategy: fsdp2 - dp_size: null - tp_size: 1 - cp_size: 2 -``` - -:::{important} -When `cp_size > 1`, fused RoPE is automatically disabled. Some models also require the Transformer Engine (TE) attention backend for CP with packed sequences — the framework will raise an error with instructions if this applies. -::: - -#### Expert Parallelism (MoE models) - -EP distributes MoE experts across GPUs. Set `ep_size` to the number of GPUs that share the full set of experts: - -```yaml -distributed: - strategy: fsdp2 - tp_size: 1 - cp_size: 1 - pp_size: 1 - ep_size: 8 - activation_checkpointing: true -``` - -EP only applies to Mixture-of-Experts models (e.g. Qwen3-MoE, Mixtral, DeepSeek-V3). For dense models, leave `ep_size` at `1` or omit it. - -#### Combining Multiple Dimensions - -You can combine TP, PP, CP, and EP in a single config. For example, a large MoE model on a multi-node cluster might use: - -```yaml -distributed: - strategy: fsdp2 - dp_size: null - tp_size: 1 - cp_size: 2 - pp_size: 1 - ep_size: 4 - activation_checkpointing: true -``` - -When choosing a combination, keep these rules in mind: - -- **`world_size` must be evenly divisible by** `pp_size × tp_size × cp_size` (the quotient becomes `dp_size`). -- **`(dp_size × cp_size) % ep_size == 0`** — EP shares the DP×CP groups. -- **TP within a node, PP across nodes** is the typical layout — TP requires fast NVLink bandwidth, while PP tolerates higher latency. -- **Start simple.** Use DP-only first. Add TP if the model doesn't fit on one GPU. Add PP for very large models. Add CP for long sequences. Add EP only for MoE architectures. - -## Next Steps - -- [Integrate Your Own Text Dataset](dataset.md) — swap the SQuAD example for your own data. -- [Recipes and End-to-End Examples](../overview.md) — browse the full set of recipes available in NeMo AutoModel.
See also the [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune) directory for ready-to-run configs. -- [Dataset Overview](../dataset-overview.md) — see all supported dataset types across LLM, VLM, and retrieval tasks. -- [Knowledge Distillation](knowledge-distillation.md) — distill a fine-tuned model into a smaller one. - -``` - -File: /Users/mromeijn/src/Automodel/docs/guides/llm/dataset.md -```md -# Integrate Your Own Text Dataset - -This guide shows you how to integrate your own dataset into NeMo Automodel for training. You'll learn about two main dataset types: **completion datasets** for language modeling (like [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag)) and **instruction datasets** for question-answering tasks (like [SQuAD](https://huggingface.co/datasets/rajpurkar/squad)). We'll cover how to create custom datasets by implementing the required methods and preprocessing functions, and finally show you how to specify your own data logic using YAML configuration with file paths—allowing you to define custom dataset processing without modifying the main codebase. - -## Quick Start Summary -| **Type** | **Use Case** | **Example** | **Preprocessor** | **Section** | -| --------------- | ------------------ | -------------- | --------------------------------- | --------------------------- | -| ✍️ Completion | Language modeling | HellaSwag | `SFTSingleTurnPreprocessor` | [Jump](#completion-datasets) | -| 🗣️ Instruction | Question answering | SQuAD | `make_*` function | [Jump](#instruction-datasets) | - -## Types of Supported Datasets - -NeMo Automodel supports a variety of datasets, depending on the task. -### Completion Datasets - -**Completion datasets** are single text sequences designed for language modeling where the model learns to predict the next token given a context. These datasets typically contain a context (prompt) and a target (completion) that the model should learn to generate. 
- -#### Example: HellaSwag - -The [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag) dataset is a popular completion dataset used for commonsense reasoning. It contains situations with multiple-choice endings where the model must choose the most plausible continuation. - -**HellaSwag dataset structure:** -- **Context (`ctx`)**: A situation or scenario description -- **Endings**: Multiple possible completions (4 options) -- **Label**: Index of the correct ending - -**Example:** -``` -Context: "A man is sitting at a piano in a large room." -Endings: [ - "He starts playing a beautiful melody.", - "He eats a sandwich while sitting there.", - "He suddenly becomes invisible.", - "He transforms into a robot." -] -Label: 0 # First ending is correct -``` - -#### Preprocessing with SFTSingleTurnPreprocessor - -NeMo Automodel provides the `SFTSingleTurnPreprocessor` class to handle completion datasets. This processor: - -1. **Extracts context and target** using `get_context()` and `get_target()`. -2. **Tokenizes and cleans** context and target separately. -3. **Concatenates** them into one sequence. -4. **Creates loss mask**: `-100` for context, target IDs for target. -5. **Pads** sequences to equal length. 
- - -#### Create Your Own Completion Dataset - -To adapt your dataset into this format, define a class like this: - -```python -from datasets import load_dataset -from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor - -class MyCompletionDataset: - def __init__(self, path_or_dataset, tokenizer, split="train"): - raw_datasets = load_dataset(path_or_dataset, split=split) - processor = SFTSingleTurnPreprocessor(tokenizer) - self.dataset = processor.process(raw_datasets, self) - - def get_context(self, examples): - """Extract context/prompt from your dataset""" - return examples["context_field"] # Replace with your context field - - def get_target(self, examples): - """Extract target/completion from your dataset""" - return examples["target_field"] # Replace with your target field - - def __getitem__(self, index): - return self.dataset[index] - - def __len__(self): - return len(self.dataset) -``` - - -### Instruction Datasets - -**Instruction datasets** are question-answer pairs where the model learns to respond to specific instructions or questions. These datasets are structured as context-question pairs with corresponding answers, making them ideal for teaching models to follow instructions and provide accurate responses. - -#### Example: SQuAD - -The [SQuAD (Stanford Question Answering Dataset)](https://huggingface.co/datasets/rajpurkar/squad) is a popular instruction dataset for reading comprehension. It contains questions based on Wikipedia articles along with their answers. - -**SQuAD dataset structure:** -- **Context**: A paragraph of text from Wikipedia -- **Question**: A question about the context -- **Answers**: The correct answer with its position in the context - -#### Create Your Own Instruction Dataset - -The [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) file contains the implementation for processing the SQuAD dataset into a format suitable for instruction tuning. 
It defines a dataset class and preprocessing functions that extract the context, question, and answer fields, concatenate them into a prompt-completion format, and apply tokenization, padding, and loss masking. This serves as a template for building custom instruction datasets by following a similar structure and adapting the extraction logic to your dataset's schema. - -Based on the SQuAD implementation in `squad.py`, you can create your own instruction dataset using the `make_squad_dataset` pattern: - -```python -from datasets import load_dataset - -def make_my_instruction_dataset( - tokenizer, - seq_length=None, - limit_dataset_samples=None, - split="train", - dataset_name="your-dataset-name", -): - if limit_dataset_samples: - split = f"{split}[:{limit_dataset_samples}]" - - dataset = load_dataset(dataset_name, split=split) - - return dataset.map( - your_own_fmt_fn, # Your formatting function - batched=False, - remove_columns=dataset.column_names, - ) -``` - -## YAML-based Custom Dataset Configuration - -NeMo Automodel supports YAML-based dataset specification using the `_target_` key. This lets you reference dataset-building classes or functions using either: - -- 1. Python Dotted Path - -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train -``` - -- 2. File Path + Function Name - -``` -<file_path>:<function_name> -``` - -Where: -- `<file_path>`: The absolute path to a Python file containing your dataset function -- `<function_name>`: The name of the function to call from that file - -```yaml -dataset: - _target_: /path/to/your/custom_dataset.py:build_my_dataset - num_blocks: 111 -``` -This will call `build_my_dataset()` from the specified file with the other keys (e.g., `num_blocks`) as arguments. This approach allows you to integrate custom datasets via config alone—no need to alter the codebase or package structure.
- - -## Packed Sequence Support in NeMo AutoModel -NeMo AutoModel supports **packed sequences**, a technique to optimize training with variable-length sequences (e.g., text) by minimizing padding. - -### What is a Packed Sequence? -Instead of padding each sequence to a fixed length (wasting computation on `[PAD]` tokens), packed sequences: -- Concatenate short sequences into a single continuous sequence. -- Separate sequences with special tokens (e.g., `[EOS]`). -- Track lengths via an "attention mask" to prevent cross-sequence information leakage. - -### Benefits -- Reduces redundant computation on padding tokens leading to faster training. -- Enables larger effective batch sizes leading to better GPU utilization. -- Especially useful for language modeling and text datasets. - - -### Enable Packed Sequences in NeMo Automodel - -To enable packed sequences, add these keys to your recipe's YAML config: -``` -packed_sequence: - # Set packed_sequence_size > 0 to run with packed sequences - packed_sequence_size: 1024 - split_across_pack: False -``` - -The `packed_sequence` section has two options: -- **packed_sequence_size**: Defines the total token length of each packed sequence; higher values require more GPU memory. -- **split_across_pack**: If true, a sequence that does not fit in the current pack is split across different packed sequences. - - -## Troubleshooting Tips - -- **Tokenization Mismatch?** Ensure your tokenizer aligns with the model's expected inputs. -- **Dataset too large?** Use `limit_dataset_samples` in your YAML config to load a subset, useful for quick debugging. -- **Loss not decreasing?** Verify that your loss mask correctly ignores prompt tokens. - -``` - -File: /Users/mromeijn/src/Automodel/docs/guides/dataset-overview.md -```md -# Dataset Overview: LLM, VLM, and Retrieval Datasets - -This page summarizes the datasets supported in NeMo AutoModel for LLM, VLM, and retrieval training and shows how to plug in your own datasets using Python functions or the YAML `_target_` mechanism.
- -- See also: [LLM datasets](llm/dataset.md), [VLM datasets](vlm/dataset.md), and [Retrieval dataset](llm/retrieval-dataset.md) for deeper, task-specific guides. - -- If a dataset you need is missing, please open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues) with a short description and example schema so we can prioritize support. ---- - -## LLM Datasets - -NeMo AutoModel supports several common patterns for language modeling and instruction tuning. -### HellaSwag (Completion SFT) -- Wrapper: `nemo_automodel.components.datasets.llm.hellaswag.HellaSwag` -- Use case: single-turn completion-style SFT where a prompt (ctx) is followed by a gold continuation (ending) -- Key args: `path_or_dataset`, `split`, `num_samples_limit` -- Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train -``` - -### SQuAD-Style Question Answering (QA) (Instruction SFT) -- Factory: `nemo_automodel.components.datasets.llm.squad.make_squad_dataset` -- Use case: instruction/QA tuning with either prompt-and-answer formatting or chat-template formatting -:::{note} -- If the tokenizer has a chat template and you want answer-only loss, you must provide `start_of_turn_token`. -- Optional `seq_length` can be used for padding/truncation. 
-::: -- Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - split: train - dataset_name: rajpurkar/squad - start_of_turn_token: "<|assistant|>" -``` - -- **ColumnMappedTextInstructionDataset (generic instruction SFT)** - - Class: `nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset` - - Use case: quickly adapt instruction datasets by mapping your schema's columns to `context`, `question`, `answer` - - Sources: local JSON/JSONL or Hugging Face Hub dataset ID - - Notes: - - For tokenizers with chat templates and answer-only loss, you may set `answer_only_loss_mask: true` and provide `start_of_turn_token`. - - Supports streaming mode for large datasets (see [Streaming Datasets](#streaming-datasets) section below). - - Map-style, non-streaming dataset (supports `len(ds)` and `ds[i]`) - - For streaming (including Delta Lake / Databricks), use `ColumnMappedTextInstructionIterableDataset` - - Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset - path_or_dataset_id: Muennighoff/natural-instructions - split: train - column_mapping: - context: definition - question: inputs - answer: targets - answer_only_loss_mask: true - start_of_turn_token: "<|assistant|>" -``` -See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information. - -- **ChatDataset (multi-turn conversations and tool calling)** - - Class: `nemo_automodel.components.datasets.llm.ChatDataset` - - Use case: multi-turn conversations and tool calling in OpenAI chat format - - Sources: local JSON/JSONL or Hugging Face Hub dataset ID - - Key args: - - `path_or_dataset_id`: path to local file(s) or HuggingFace dataset ID - - `tokenizer`: tokenizer instance (required. 
Must have chat template support) - - `split`: dataset split (e.g., "train", "validation") - - `name`: dataset configuration/subset name - - `seq_length`: maximum sequence length for padding/truncation - - `padding`: padding strategy ("do_not_pad", "max_length", etc.) - - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.) - - `start_of_turn_token`: token marking assistant response start (for answer-only loss) - - `chat_template`: optional override for tokenizer's chat template - - `skip_invalid_samples`: if ``true``, skip malformed JSONL lines when reading local files (warnings log skip counts); default ``false`` fails fast on a bad line - - Notes: - - Requires a tokenizer with chat template support - - Supports both single-turn and multi-turn tool calling - - Tool definitions are provided in a `tools` field at the conversation level - - Tool calls appear in assistant messages via `tool_calls` field - - Tool responses use the `tool` role -### ChatDataset (Multi-Turn Conversations and Tool Calling) -- Class: `nemo_automodel.components.datasets.llm.ChatDataset` -- Use case: multi-turn conversations and tool calling in OpenAI chat format -- Sources: local JSON/JSONL or Hugging Face Hub dataset ID -- Key args: - - `path_or_dataset_id`: path to local file(s) or Hugging Face dataset ID - - `tokenizer`: tokenizer instance (required; must have chat template support) - - `split`: dataset split (e.g., "train", "validation") - - `name`: dataset configuration/subset name - - `seq_length`: maximum sequence length for padding/truncation - - `padding`: padding strategy ("do_not_pad", "max_length", etc.) - - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.) 
- - `start_of_turn_token`: token marking assistant response start (for answer-only loss) - - `chat_template`: optional override for tokenizer's chat template - - `mask_reasoning_content`: optionally exclude rendered `reasoning_content` tokens from loss - - `skip_invalid_samples`: if ``true``, skip malformed JSONL lines when reading local files (warnings log skip counts); default ``false`` fails fast on a bad line -:::{note} -- Requires a tokenizer with chat template support -- Supports both single-turn and multi-turn tool calling -- Assistant messages may also include `reasoning_content` for structured reasoning traces -- Tool definitions are provided in a `tools` field at the conversation level -- Tool calls appear in assistant messages through the `tool_calls` field -- Tool responses use the `tool` role and must include `tool_call_id` -- If your dataset contains `reasoning_content`, your chat template must render it explicitly or it will be dropped -- For multi-turn tool-calling datasets, prefer chat templates that use `{% generation %}` blocks so assistant-turn loss masking is exact -- Set `mask_reasoning_content: true` if you want to train on the final assistant answer while excluding rendered reasoning traces from loss -- Set `skip_invalid_samples: true` for noisy local JSONL so lines that are not valid JSON are skipped instead of failing the load -::: -- Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.ChatDataset - path_or_dataset_id: Salesforce/xlam-function-calling-60k - split: train - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: google/functiongemma-270m-it - seq_length: 2048 - start_of_turn_token: "" - mask_reasoning_content: false - skip_invalid_samples: false -``` - - Expected data format (OpenAI messages format): -```json -{ - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." 
- }, - { - "role": "user", - "content": "What's the weather in Seattle and should I bring an umbrella?" - }, - { - "role": "assistant", - "reasoning_content": "The user wants weather info and advice. I should call get_weather first, then decide whether an umbrella is needed.", - "content": "", - "tool_calls": [ - { - "id": "call_1", - "type": "function", - "function": { - "name": "get_weather", - "arguments": "{\"city\": \"Seattle\"}" - } - } - ] - }, - { - "role": "tool", - "tool_call_id": "call_1", - "content": "{\"temperature\": 55, \"condition\": \"rain\", \"precipitation_chance\": 0.85}" - }, - { - "role": "assistant", - "reasoning_content": "It is raining with a high precipitation chance, so I should recommend bringing an umbrella.", - "content": "It's currently 55 degrees F and raining in Seattle with an 85% chance of continued precipitation. Yes, definitely bring an umbrella." - } - ], - "tools": [ - { - "type": "function", - "function": { - "name": "get_weather", - "description": "Get current weather for a city", - "parameters": { - "type": "object", - "properties": { - "city": {"type": "string"} - }, - "required": ["city"] - } - } - } - ] -} -``` - - Template requirement example for `reasoning_content`: -```jinja -{%- if message.reasoning_content %} -{% generation %} -{{ "\n" + message.reasoning_content + "\n\n" }} -{% endgeneration %} -{%- endif %} -{% generation %} -{{ message.content }} -{% endgeneration %} -``` - - For single-turn tool calling (one tool call per conversation), omit the tool response and final assistant message: -```json -{ - "messages": [ - { - "role": "user", - "content": "Book a table for two at 7pm in Seattle." 
- }, - { - "role": "assistant", - "content": "", - "tool_calls": [ - { - "id": "call_1", - "type": "function", - "function": { - "name": "book_table", - "arguments": "{\"party_size\": 2, \"time\": \"19:00\", \"city\": \"Seattle\"}" - } - } - ] - } - ], - "tools": [ - { - "type": "function", - "function": { - "name": "book_table", - "description": "Book a restaurant table", - "parameters": { - "type": "object", - "properties": { - "party_size": {"type": "integer"}, - "time": {"type": "string"}, - "city": {"type": "string"} - } - } - } - } - ] -} -``` -See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example with FunctionGemma. -For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_fineproofs_chat.yaml](../../examples/llm_finetune/qwen/qwen2_5_0p5b_instruct_fineproofs_chat.yaml). - -### Retrieval (Embedding Fine-Tuning) -- Factory: `nemo_automodel.components.datasets.llm.make_retrieval_dataset` -- Collator: `nemo_automodel.components.datasets.llm.BiEncoderCollator` -- Use case: embedding model fine-tuning with (query, positive doc, negative docs) contrastive learning -- Supported schemas: - - Corpus-ID JSON (Merlin/NeMo-retriever style) - - Inline-text JSONL (e.g., `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}`) -- Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset - data_dir_list: /abs/path/to/train.jsonl - data_type: train - n_passages: 5 -collate_fn: - _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator - q_max_len: 512 - p_max_len: 512 -``` -See the detailed guide, [Retrieval dataset](llm/retrieval-dataset.md), for more information. 
- -### NanoGPT Binary Shards (Pretraining) -- Class: `nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset` -- Use case: token-level LM pretraining over `.bin` shards produced by NanoGPT-style preprocessors (supports legacy and current formats) -:::{note} -- Streams contiguous `seq_len` slices, supports optional BOS alignment and `.bos.idx` sidecar files -- Related tool: `tools/nanogpt_data_processor.py` -::: - -### Megatron (Pretraining; Interoperable With Pre-Tokenized Megatron Data) -- Class: `nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining` -- Use case: large-scale LM pretraining over Megatron-LM formatted tokenized corpora -- Interoperability: If your corpus has already been tokenized/indexed for Megatron (i.e., `.bin`/`.idx` pairs), you can point AutoModel to those assets directly. No re-tokenization required. -- Key args: `paths` (single path, glob, weighted list, or per-split dict), `seq_length`, `tokenizer`, `split`, `index_mapping_dir`, `splits_to_build` -- Example YAML: -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining - paths: /abs/path/to/processed_data_*_text_document* # glob or explicit list - index_mapping_dir: /abs/path/to/mapping_dir - tokenizer: - _target_: transformers.AutoTokenizer.from_pretrained - pretrained_model_name_or_path: openai-community/gpt2 - seq_length: 1024 - split: "0.99, 0.01, 0.00" # train, validation, test - splits_to_build: "train" -``` -See the detailed [pretraining guide](llm/pretraining.md), which uses MegatronPretraining data. - -## Streaming Datasets - -Streaming datasets enable processing very large datasets without loading them entirely into memory. This is particularly useful when working with datasets that exceed available RAM or when you want to start training immediately without waiting for the full dataset to download. - -### What Are Streaming Datasets? 
- -Streaming datasets load and process data incrementally, one batch at a time, rather than loading the entire dataset into memory upfront. This approach: - -- **Reduces memory footprint**: Only the current batch resides in memory -- **Enables training on massive datasets**: Process terabyte-scale datasets on machines with limited RAM -- **Faster startup**: Begin training immediately without waiting for full dataset download -- **Efficient for remote datasets**: Stream directly from Hugging Face Hub without local storage - -### When to Use Streaming - -Use streaming mode when: - -- Your dataset is very large (hundreds of GB or TB) -- Available memory is limited compared to dataset size -- You want to start training quickly without downloading the full dataset -- You're experimenting with a subset of a large dataset - -Avoid streaming when: - -- Your dataset is small enough to fit comfortably in memory -- You need random access to samples (e.g., for certain sampling strategies) -- You need to know the exact dataset length upfront -- Training requires multiple passes with different orderings - -### How to Enable Streaming - -For `ColumnMappedTextInstructionDataset`, use the streaming variant by changing the class to `ColumnMappedTextInstructionIterableDataset`: - -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset - path_or_dataset_id: Muennighoff/natural-instructions - split: train - column_mapping: - context: definition - question: inputs - answer: targets - answer_only_loss_mask: true - start_of_turn_token: "<|assistant|>" -``` - -For Hugging Face datasets loaded directly, set `streaming=True`: - -```python -from datasets import load_dataset - -# Non-streaming (loads entire dataset into memory) -dataset = load_dataset("large-dataset/corpus", split="train", streaming=False) - -# Streaming (loads data incrementally) -dataset = 
load_dataset("large-dataset/corpus", split="train", streaming=True) -``` - -### Streaming Limitations - -When using streaming datasets, be aware of these limitations: - -1. **No random access**: You cannot use `dataset[index]` to access specific samples. Streaming datasets only support iteration. - -2. **No length information**: The `len(dataset)` operation is not available. You cannot determine the total number of samples upfront. - -3. **Single-pass iteration**: Each iteration consumes the stream. To iterate multiple times, you need to recreate the dataset or use the `repeat_on_exhaustion` parameter. - -4. **Limited shuffling**: Shuffling is done with a buffer (not the entire dataset), which may not provide perfect randomization. - -### Distributed Training with Streaming - -Streaming datasets support distributed training through sharding: - -```python -from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import ( - ColumnMappedTextInstructionIterableDataset -) - -dataset = ColumnMappedTextInstructionIterableDataset( - path_or_dataset_id="large-dataset/corpus", - column_mapping={"question": "input", "answer": "output"}, - tokenizer=tokenizer, -) - -# Shard the dataset across workers -dataset = dataset.shard(num_shards=8, index=worker_id) - -# Enable shuffling with a buffer -dataset = dataset.shuffle(buffer_size=10000, seed=42) - -# Set epoch for deterministic shuffling across epochs -dataset.set_epoch(epoch_num) -``` - -### Performance Considerations - -**Memory vs. 
Speed Trade-offs**: -- Streaming reduces memory usage but may be slower than in-memory datasets -- Network latency can impact streaming performance for remote datasets -- Use local caching when repeatedly accessing the same remote dataset - -**Buffer Size for Shuffling**: -- Larger buffers provide better randomization but use more memory -- A buffer size of 10,000-100,000 samples is typically a good balance -- For perfect shuffling, you need a buffer size equal to the dataset size (defeating the purpose of streaming) - -**Prefetching**: -- Most streaming implementations prefetch data in the background -- This helps hide network latency and keeps GPUs busy -- Adjust prefetch settings based on your network speed and batch size - -### Example: Streaming a Large Dataset - -Here's a complete example of using streaming for a large instruction-tuning dataset: - -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset - path_or_dataset_id: HuggingFaceH4/ultrachat_200k - split: train_sft - column_mapping: - question: prompt - answer: completion - answer_only_loss_mask: true - start_of_turn_token: "<|assistant|>" - repeat_on_exhaustion: true # Automatically restart when stream ends - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - batch_size: 4 - num_workers: 4 -``` - -This configuration: -- Streams the dataset without loading it fully into memory -- Automatically repeats when the stream is exhausted -- Uses multiple workers for efficient data loading -- Applies answer-only loss masking during tokenization - -## Packed Sequence Support -To reduce padding and improve throughput with variable-length sequences: -```yaml -packed_sequence: - packed_sequence_size: 8192 # > 0 enables packing - split_across_pack: false -``` -Use a collator that pads to an FP8-friendly multiple when training with FP8: -```yaml -dataloader: - _target_: 
torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - pad_seq_len_divisible: 16 -``` - ---- - -## VLM Datasets (Vision/Audio + Language) -VLM datasets are represented as conversations (message lists) that combine text with images or audio and are processed with the model's `AutoProcessor.apply_chat_template` and a suitable collate function. - -Built-in dataset makers (return lists of `conversation` dicts): -- **RDR items**: `nemo_automodel.components.datasets.vlm.datasets.make_rdr_dataset` (HF: `quintend/rdr-items`) -- **CORD-V2 receipts (Consolidated Receipt Dataset for Post-OCR Parsing)**: `nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset` (HF: `naver-clova-ix/cord-v2`) -- **MedPix-VQA (Medical Pixel Question Answering)**: `nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset` -- **CommonVoice 17 (CV17) (audio)**: `nemo_automodel.components.datasets.vlm.datasets.make_cv17_dataset` - - -Each example follows the conversation schema expected by `apply_chat_template`, e.g.: -```python -{ - "conversation": [ - { - "role": "user", - "content": [ - {"type": "image", "image": example_image}, - {"type": "text", "text": "Describe this image."} - ] - }, - { - "role": "assistant", - "content": [{"type": "text", "text": ground_truth_text}] - } - ] -} -``` - -### Custom Chat Template -By default, VLM fine-tuning uses the chat template built into the model's `AutoProcessor`. To override it, add `chat_template` under `dataset:` in your YAML config: - -```yaml -dataset: - _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset - split: train - chat_template: "{% for msg in messages %}{{ msg.role }}: {{ msg.content }}\n{% endfor %}" -``` - -`chat_template` accepts a Jinja template string, a path to a `.jinja` file, or a path to a JSON file containing a `chat_template` key. 
The override is applied to both the processor and its tokenizer before dataset instantiation. - -### Collate Functions -- `nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn` -- `nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn` (Qwen2.5 VL) -- `nemo_automodel.components.datasets.vlm.collate_fns.phi4_mm_collate_fn` (audio) - -Select in your YAML: -```yaml -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - batch_size: 1 - collate_fn: - _target_: nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn -``` -If you want answer-only loss masking, provide a model-appropriate `start_of_response_token` to the collate function. - -See [Gemma-3n](omni/gemma3-3n.md) and [VLM dataset](vlm/dataset.md) for end-to-end examples. - ---- - -## Diffusion Datasets - -Diffusion models don't train directly on raw images or videos. Instead, the data is first encoded into a compact numerical representation called a latent — this is what the model actually learns from. Text captions are similarly converted into text embeddings that the model uses as conditioning. - -This encoding is done once during preprocessing, and the results are saved as cache files (.meta). Training then reads these cache files directly, which is significantly faster than re-encoding on every step. - -The built-in preprocessing tool ([`tools/diffusion/preprocessing_multiprocess.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/diffusion/preprocessing_multiprocess.py)) handles this conversion. It uses a VAE (Variational Autoencoder) to encode visual data and a text encoder for captions, grouping outputs into resolution-bucketed directories compatible with the multiresolution dataloader. 
- -### Dataloader Builders - -- **Video (T2V)**: `nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader` — for Wan 2.1 and HunyuanVideo -- **Image (T2I)**: `nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader` — for FLUX.1-dev - -### Example YAML (Video Dataloader) - -```yaml -data: - dataloader: - _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader - cache_dir: /path/to/processed_meta - model_type: wan - base_resolution: [512, 512] - dynamic_batch_size: false - shuffle: true - drop_last: false - num_workers: 0 -``` - -See the [Diffusion Dataset Preparation](diffusion/dataset.md) guide for full preprocessing instructions and configuration details. - ---- - -## Bring Your Own Dataset -You can integrate custom datasets with zero code changes to NeMo AutoModel by using `_target_` in YAML. There are three approaches: - -### Point to an Existing Class or Function (Dotted Path) -- LLM example (class): -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag - path_or_dataset: rowan/hellaswag - split: train -``` -- LLM example (factory function): -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - split: train - dataset_name: rajpurkar/squad -``` -- VLM example (factory function): -```yaml -dataset: - _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset - split: train -``` - -### Point to a Local Python File and Function -```yaml -dataset: - _target_: /abs/path/to/my_custom_dataset.py:build_my_dataset - some_arg: 123 - split: train -``` -Where `build_my_dataset` returns either a `datasets.Dataset` or a list/iterator of conversation dicts (for VLM). 
- -### Use ColumnMappedTextInstructionDataset for Most Instruction Datasets (LLM) -- Ideal when your data has columns like `instruction`, `input`, or `output` but with arbitrary names -- Supports local JSON/JSONL and HF Hub -```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset - path_or_dataset_id: /abs/path/to/*.jsonl # or org/repo on HF - column_mapping: - context: definition - question: inputs - answer: targets - answer_only_loss_mask: true - start_of_turn_token: "<|assistant|>" -``` - -### Implement a Minimal Custom Class Pattern (LLM Completion) -If you prefer Python, implement `get_context` and `get_target` and reuse the built-in preprocessor: -```python -from datasets import load_dataset -from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor - -class MyCompletionDataset: - def __init__(self, path_or_dataset, tokenizer, split="train"): - raw_ds = load_dataset(path_or_dataset, split=split) - self.dataset = SFTSingleTurnPreprocessor(tokenizer).process(raw_ds, self) - - def get_context(self, examples): - return examples["my_context_field"] - - def get_target(self, examples): - return examples["my_target_field"] -``` -Then reference your class with `_target_` in YAML. - -### Important Considerations -- **Chat templates**: If your tokenizer has a chat template and you want answer-only loss, provide the correct `start_of_turn_token` (LLM) or `start_of_response_token` (VLM collate functions). -- **Padding for FP8**: If training with FP8, set `pad_seq_len_divisible: 16` in your collate function to align sequence lengths. -- **Packed sequences**: Prefer packed sequences for throughput when fine-tuning LLMs on variable-length corpora. -- **Validation**: You can define a separate `validation_dataset` and `validation_dataloader` block mirroring your training config. 
- -For detailed, end-to-end recipes, browse the example configs under [examples/llm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune), [examples/llm_pretrain/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_pretrain), and [examples/vlm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune). - -``` - -File: /Users/mromeijn/src/Automodel/docs/guides/checkpointing.md -```md -# Checkpointing - -## Introduction - -During machine-learning experiments, the model-training routine regularly saves checkpoints. A checkpoint is a complete snapshot of a run that includes model weights, optimizer states, and other metadata required to resume training exactly where it left off. Writing these snapshots at regular intervals lets you recover quickly from crashes or pauses without losing progress. - -NeMo Automodel checkpoints capture the complete state of a distributed training run across multiple GPUs or nodes. This reduces memory overhead, improves GPU utilization, and allows training to be resumed with a different parallelism strategy. - -NeMo Automodel writes checkpoints in two formats: [Hugging Face Safetensors](https://github.com/huggingface/safetensors) and [PyTorch Distributed Checkpointing (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html). It also supports two layouts: - -- **Consolidated Checkpoints**: The complete model state is saved as a Hugging Face-compatible bundle, typically in a single file or a compact set of files with an index. Because tensors are not split across GPUs (unsharded), tools like Hugging Face, vLLM, and SGLang can load these checkpoints directly. - -- **Sharded Checkpoints**: During distributed training with parameter sharding, each GPU holds a subset (or "shard") of the full state, such as model weights and optimizer states. When checkpointing, each GPU writes its own shard independently without reconstructing the full model state. 
- -We provide an overview of the different types of available checkpoint formats in the table below. - -Task | Model domain | DCP (sharded) | Safetensors (sharded) | Safetensors (consolidated) | ------|----------------------|:-----------:|:-------------------:|:------------------------:| -SFT | LLM | ✅ | ✅ | ✅ | -SFT | VLM | ✅ | ✅ | ✅ | -PEFT | LLM / VLM | 🚧 | 🚧 | ✅ | - - -Changing between output formats can be done seamlessly through the recipe's `yaml` configuration file: -```yaml -checkpoint: - ... - model_save_format: safetensors # Format for saving (torch_save or safetensors) - save_consolidated: true # Change to false if you want to save sharded checkpoints. - ... -``` -> **Note:** For optimal compatibility with the Hugging Face ecosystem, including downstream tools such as vLLM and SGLang, we recommend using the checkpoint configuration provided above. - -::: {note} -The optimizer states are _always_ saved in DCP (`.distcp` extension) format. -::: - -## Checkpoint Symbolic Links - -NeMo Automodel automatically creates symbolic links in the checkpoint directory to provide convenient access to important checkpoints: - -- **LATEST**: Points to the most recently saved checkpoint. This is useful for resuming training from the last saved state. -- **LOWEST_VAL**: Points to the checkpoint with the lowest validation score/loss. This provides easy access to the best-performing checkpoint based on validation metrics, making it ideal for model evaluation or deployment. - -These symbolic links eliminate the need to manually track checkpoint names or search through directories to find the best model. When validation is enabled in your training run, both links are automatically maintained and updated as training progresses. - -## Safetensors -To ensure seamless integration with the Hugging Face ecosystem, NeMo Automodel saves checkpoints in the [Safetensors](https://github.com/huggingface/safetensors) format. 
Safetensors is a memory-safe, zero-copy alternative to Python's pickle format (PyTorch .bin) that is natively supported by Hugging Face Transformers and offers both safety and performance advantages over pickle-based approaches. - -### Key Benefits: -- **Native Hugging Face Compatibility**: Checkpoints can be loaded directly into Hugging Face-compatible tools, including vLLM, SGLang, and others. -- **Memory Safety and Speed**: The Safetensors format prohibits saving serialized Python code, ensuring memory safety, and supports zero-copy loading for improved performance. -- **Optional Consolidation**: Sharded checkpoints can be merged into a standard Hugging Face model format for easier downstream use. - -**Most importantly**, this format offers the added advantage of optionally consolidating multiple shards into a complete Hugging Face format model. - -### Example - -The following command runs the LLM fine-tuning recipe on two GPUs and saves the resulting checkpoint in the Safetensors format: -```bash -automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ - --step_scheduler.ckpt_every_steps 20 \ - --checkpoint.model_save_format safetensors \ - --checkpoint.save_consolidated True -``` - -::: {note} -In the above command we used the [`llama3_2_1b_squad.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/492add84a2b9d495946fe211c28973cd00051f3e/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) config as a running example; adjust it as necessary for your use case. -More config examples can be found in our [`examples/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples) directory. -::: - -If you're running on a single GPU, you can run: -```bash -automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ - --step_scheduler.ckpt_every_steps 20 \ - --checkpoint.model_save_format safetensors \ - --checkpoint.save_consolidated True -``` - -After running for a few seconds, the standard output should be: -``` -... 
-> Saving checkpoint to checkpoints/epoch_0_step_20 -... -``` - -The `checkpoints/` should have the following contents: -``` -checkpoints/ -├── LATEST -> epoch_0_step_20 -├── LOWEST_VAL -> epoch_0_step_20 -└── epoch_0_step_20 - ├── model - │ ├── consolidated - │ │ ├── config.json - │ │ ├── generation_config.json - │ │ ├── model-00001-of-00001.safetensors - │ │ ├── model.safetensors.index.json - │ │ ├── special_tokens_map.json - │ │ ├── tokenizer.json - │ │ └── tokenizer_config.json - │ ├── shard-00001-model-00001-of-00001.safetensors - │ └── shard-00002-model-00001-of-00001.safetensors - └── optim - ├── __0_0.distcp - └── __1_0.distcp -... -``` - -The `epoch_0_step_20/` directory stores the full training state from step `20` of the first epoch, including both the model and optimizer states. - -We can load and run the consolidated checkpoint using the Hugging Face Transformers API directly: -```python -import torch -from transformers import pipeline - -model_id = "checkpoints/epoch_0_step_20/model/consolidated/" -pipe = pipeline( - "text-generation", - model=model_id, - torch_dtype=torch.bfloat16, - device_map="auto", -) - -print(pipe("The key to life is")) - ->>> [{'generated_text': 'The key to life is to be happy. The key to happiness is to be kind. The key to kindness is to be'}] -``` - -Although this example uses the Hugging Face Transformers API, the `consolidated/` checkpoint is compatible with any Hugging Face-compatible tool, such as vLLM, SGLang, and others. - - -## PEFT -When training with Parameter-Efficient Fine-Tuning (PEFT) techniques, only a small subset of model weights are updated — the rest of the model remains frozen. This dramatically reduces the size of the checkpoint, often to just a few megabytes. - -### Why Consolidated Checkpoints? -Because the PEFT state is so lightweight, sharded checkpointing adds unnecessary overhead. Instead, NeMo Automodel automatically saves a single, consolidated Hugging Face–compatible checkpoint when using PEFT. 
This makes it: - -- easier to manage and share (just the adapters), -- compatible with Hugging Face Transformers out of the box, -- ideal for deployment and downstream evaluation. - -### Example: PEFT Fine-Tuning on Two GPUs - -To fine-tune a model using PEFT and save a Hugging Face–ready checkpoint: -```bash -automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag_peft.yaml --step_scheduler.ckpt_every_steps 20 --checkpoint.model_save_format safetensors -``` - -After training, you'll get a compact, consolidated Safetensors checkpoint that can be loaded directly with Hugging Face tools: - -``` -checkpoints/ -├── LATEST -> epoch_0_step_20 -├── LOWEST_VAL -> epoch_0_step_20 -├── epoch_0_step_20 -│ ├── config.yaml -│ ├── dataloader -│ │ ├── dataloader_dp_rank_0.pt -│ │ └── dataloader_dp_rank_1.pt -│ ├── losses.json -│ ├── model -│ │ ├── adapter_config.json -│ │ ├── adapter_model.safetensors -│ │ ├── automodel_peft_config.json -│ │ ├── special_tokens_map.json -│ │ ├── tokenizer.json -│ │ └── tokenizer_config.json -│ ├── optim -│ │ ├── __0_0.distcp -│ │ └── __1_0.distcp -│ ├── rng -│ │ ├── rng_dp_rank_0.pt -│ │ └── rng_dp_rank_1.pt -│ └── step_scheduler.pt -├── training.jsonl -└── validation.jsonl -``` - -The example below showcases the direct compatibility of NeMo Automodel with Hugging Face and PEFT: -```python -from peft import AutoPeftModelForCausalLM -from transformers import AutoTokenizer - -checkpoint_path = "checkpoints/epoch_0_step_20/model/" -model = AutoPeftModelForCausalLM.from_pretrained(checkpoint_path) -tokenizer = AutoTokenizer.from_pretrained(checkpoint_path) - -model = model.to("cuda") -model.eval() -inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt") - -outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50) -print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]) - ->>> Preheat the oven to 350 degrees and place 
the cookie dough in a large bowl. Roll the dough into 1-inch balls and place them on a cookie sheet. Bake the cookies for 10 minutes. While the cookies are baking, melt the chocolate chips in the microwave for 30 seconds. -``` - -## PyTorch DCP -NeMo Automodel also offers native PyTorch DCP checkpointing support (`.distcp` extension). Similar to Safetensors, it also provides the same features of load-time resharding and parallel saving. - -As a simple example, we can run the following command to launch the training recipe on two GPUs. -```bash -automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ - --step_scheduler.ckpt_every_steps 20 \ - --checkpoint.model_save_format torch_save - -... -> Saving checkpoint to checkpoints/epoch_0_step_20 -... -``` -After 20 steps, the following checkpoint will be saved: - -``` -checkpoints/ -├── LATEST -> epoch_0_step_20 -├── LOWEST_VAL -> epoch_0_step_20 -└── epoch_0_step_20 - ├── config.yaml - ├── dataloader - │ ├── dataloader_dp_rank_0.pt - │ └── dataloader_dp_rank_1.pt - ├── losses.json - ├── model - │ ├── __0_0.distcp - │ └── __1_0.distcp - └── optim - ├── __0_0.distcp - └── __1_0.distcp -... -``` - -If you rerun the script, NeMo Automodel automatically detects and restores the most recent checkpoint. -```bash -automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ - --step_scheduler.ckpt_every_steps 20 \ - --checkpoint.model_save_format torch_save - -... -> Loading checkpoint from checkpoints/epoch_0_step_20 -... -``` - -## Saving Checkpoints When Using Docker - -When training inside a Docker container (see [Installation Guide](installation.md)), any files written to the container's filesystem are lost when the container exits (especially with `--rm`). 
To keep your checkpoints, you must **bind-mount a host directory** to the checkpoint path before starting the container: - -```bash -docker run --gpus all -it --rm \ - --shm-size=8g \ - -v "$(pwd)"/checkpoints:/opt/Automodel/checkpoints \ - nvcr.io/nvidia/nemo-automodel:25.11.00 -``` - -You can also set a custom checkpoint directory via the YAML config or CLI override: -```yaml -checkpoint: - checkpoint_dir: /mnt/shared/my_checkpoints -``` -```bash -# Or via CLI override: -automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ - --checkpoint.checkpoint_dir /mnt/shared/my_checkpoints -``` - -When using a custom path, make sure the corresponding host directory is mounted into the container with `-v`. - -::: {tip} -Mount additional host directories for datasets and the Hugging Face model cache to avoid re-downloading large models across container restarts. See the [Installation Guide](installation.md) for a complete `docker run` example with all recommended mounts. -::: - -## Asynchronous Checkpointing - -NeMo Automodel can write checkpoints asynchronously to reduce training stalls caused by I/O. When enabled, checkpoint writes are scheduled in the background using PyTorch Distributed Checkpointing's async API while training continues. - -- **Enable** (YAML): - ```yaml - checkpoint: - is_async: true - ``` -- **Enable** (CLI): add `--checkpoint.is_async True` to your run command. -- **Requirements**: PyTorch ≥ 2.9.0. If an older version is detected, async mode is automatically disabled. -- **Behavior**: At most one checkpoint uploads at a time; the next save waits for the previous upload to finish. The `LATEST` symlink is updated after the async save completes (may be deferred until the next save call). During PEFT, adapter model files are written synchronously on rank 0; optimizer states can still use async. - -## Advanced Usage: Save Additional States -You can also save additional states in NeMo Automodel. 
By default, we also automatically checkpoint the `dataloader`, `rng`, and `step_scheduler` states, which are necessary to resume training accurately. In full, a Safetensors consolidated checkpoint will look like this: - -``` -checkpoints/ -├── LATEST -> epoch_0_step_20 -├── LOWEST_VAL -> epoch_0_step_20 -├── epoch_0_step_20 -│ ├── config.yaml -│ ├── dataloader -│ │ ├── dataloader_dp_rank_0.pt -│ │ └── dataloader_dp_rank_1.pt -│ ├── losses.json -│ ├── model -│ │ ├── consolidated -│ │ │ ├── config.json -│ │ │ ├── generation_config.json -│ │ │ ├── model-00001-of-00001.safetensors -│ │ │ ├── model.safetensors.index.json -│ │ │ ├── special_tokens_map.json -│ │ │ ├── tokenizer.json -│ │ │ └── tokenizer_config.json -│ │ ├── shard-00001-model-00001-of-00001.safetensors -│ │ └── shard-00002-model-00001-of-00001.safetensors -│ ├── optim -│ │ ├── __0_0.distcp -│ │ └── __1_0.distcp -│ ├── rng -│ │ ├── rng_dp_rank_0.pt -│ │ └── rng_dp_rank_1.pt -│ └── step_scheduler.pt -├── training.jsonl -└── validation.jsonl -``` - -If you want to define a new state to be checkpointed in the recipe, the easiest way is to create a new attribute in the recipe class (defined using `self.` inside the recipe). Just make sure that the new attribute implements both the `load_state_dict` and `state_dict` methods. - -Here is an example of what it might look like: - -```python - -class NewState: - - def __init__(self, ...): - self.state_value = ... - self.another_value = ... - ... - - def state_dict(self) -> dict[str, Any]: - return { - "state_value": self.state_value, - "another_value": self.another_value, - } - - def load_state_dict(self, state_dict: dict[str, Any]) -> None: - self.state_value = state_dict["state_value"] - self.another_value = state_dict["another_value"] -``` - -Inside your recipe class, define the new state as an instance attribute using `self.new_state = NewState(...)`. - -``` - -File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - - -# To run this recipe: -# automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml --nproc-per-node 8 -# Adjust --nproc-per-node to the number of GPUs available on your machine. - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 64 - local_batch_size: 8 - ckpt_every_steps: 1000 - val_every_steps: 10 # will run every x number of gradient steps - num_epochs: 1 - -dist_env: - backend: nccl - timeout_minutes: 1 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1111 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - -# torch.compile configuration -compile: - enabled: false - mode: "default" # Options: "default", "reduce-overhead", "max-autotune" - fullgraph: false - dynamic: true # Set to false for better performance with fixed shapes - backend: null # Use default backend (inductor) - -clip_grad_norm: - max_norm: 1.0 - -distributed: - strategy: fsdp2 - dp_size: none - tp_size: 1 - cp_size: 1 - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: train - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: 
torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - shuffle: false - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - limit_dataset_samples: 64 - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: - _target_: nemo_automodel.components.datasets.utils.default_collater - -optimizer: - _target_: torch.optim.Adam - betas: [0.9, 0.999] - eps: 1e-8 - lr: 1.0e-5 - weight_decay: 0 - # min_lr: 1.0e-5 - -lr_scheduler: - lr_decay_style: cosine - min_lr: 1.0e-6 - -# Uncomment and configure for W&B logging -# wandb: -# project: -# entity: -# name: -# save_dir: - -# Uncomment and configure for Mlflow logging -# mlflow: -# experiment_name: "automodel-llm-llama3_2_1b_squad-finetune" -# run_name: "" -# tracking_uri: null -# artifact_location: null -# tags: -# task: "squad-finetune" -# model_family: "llama3.2" -# model_size: "1b" -# dataset: "squad" -# framework: "automodel" - -ci: - recipe_owner: akoumpa - -``` - -File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - - -# To run this recipe: -# automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml --nproc-per-node 8 -# Adjust --nproc-per-node to the number of GPUs available on your machine. - -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 64 - local_batch_size: 8 - ckpt_every_steps: 1000 - val_every_steps: 10 # will run every x number of gradient steps - num_epochs: 1 - -dist_env: - backend: nccl - timeout_minutes: 1 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 1111 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.2-1B - - -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - match_all_linear: True - dim: 8 - alpha: 32 - use_triton: True - -distributed: - strategy: fsdp2 - dp_size: none - tp_size: 1 - cp_size: 1 - - sequence_parallel: false - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: train - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - shuffle: false - -validation_dataset: - _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset - dataset_name: rajpurkar/squad - split: validation - limit_dataset_samples: 64 - -validation_dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - -optimizer: - _target_: torch.optim.Adam - betas: [0.9, 0.999] - eps: 1e-8 - lr: 1.0e-5 - weight_decay: 0 - # min_lr: 1.0e-5 - -lr_scheduler: - lr_decay_style: cosine - min_lr: 1.0e-6 - -# Uncomment and configure for W&B logging -# wandb: -# project: -# entity: -# name: -# save_dir: 
- -ci: - recipe_owner: akoumpa - -``` - -File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_1/llama3_1_8b_columnmapped_lora.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# QLora configuration for Llama-3.1-8B on SQuAD dataset -# Uses 4-bit quantization with LoRA adapters -# -# To run this recipe: -# automodel examples/llm_finetune/llama3_1/llama3_1_8b_columnmapped_lora.yaml --nproc-per-node 8 -# Adjust --nproc-per-node to the number of GPUs available on your machine. 
- -recipe: TrainFinetuneRecipeForNextTokenPrediction - -step_scheduler: - global_batch_size: 32 - local_batch_size: 4 - ckpt_every_steps: 100 - val_every_steps: 600 - max_steps: 500 - -dist_env: - backend: nccl - timeout_minutes: 1 - -rng: - _target_: nemo_automodel.components.training.rng.StatefulRNG - seed: 42 - ranked: true - -model: - _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained - pretrained_model_name_or_path: meta-llama/Llama-3.1-8B - -peft: - _target_: nemo_automodel.components._peft.lora.PeftConfig - match_all_linear: true - dim: 16 - alpha: 32 - dropout: 0.1 - -distributed: - strategy: fsdp2 - dp_size: none - tp_size: 1 - cp_size: 1 - - sequence_parallel: false - -loss_fn: - _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy - -dataset: - _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset - path_or_dataset_id: Muennighoff/natural-instructions - split: train - column_mapping: - context: definition - question: inputs - answer: targets - -packed_sequence: - packed_sequence_size: 0 - -dataloader: - _target_: torchdata.stateful_dataloader.StatefulDataLoader - collate_fn: nemo_automodel.components.datasets.utils.default_collater - shuffle: false - -# validation_dataset: -# _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset -# path_or_dataset_id: Muennighoff/natural-instructions -# split: validation -# column_mapping: -# instruction: definition -# question: inputs -# answer: targets - -# validation_dataloader: -# _target_: torchdata.stateful_dataloader.StatefulDataLoader -# collate_fn: nemo_automodel.components.datasets.utils.default_collater -# shuffle: false - -optimizer: - _target_: torch.optim.AdamW - betas: [0.9, 0.999] - eps: 1e-8 - lr: 1.0e-5 - weight_decay: 0.01 - -# Uncomment and configure for W&B logging -# wandb: -# project: -# entity: -# name: llama3_1_8b_squad_qlora -# 
save_dir: - -ci: - recipe_owner: akoumpa - time: "00:15:00" - -``` - diff --git a/skills/nemotron-customize/context/curator-data-acquisition.txt b/skills/nemotron-customize/context/curator-data-acquisition.txt deleted file mode 100644 index c51ccba8b..000000000 --- a/skills/nemotron-customize/context/curator-data-acquisition.txt +++ /dev/null @@ -1,2905 +0,0 @@ - -/Users/mromeijn/src/Curator -├── docs -│ ├── about -│ │ ├── concepts -│ │ │ ├── text -│ │ │ │ ├── _images -│ │ │ │ ├── data-acquisition-concepts.md * -│ │ │ │ └── data-loading-concepts.md * -│ │ │ ├── audio -│ │ │ ├── image -│ │ │ └── video -│ │ │ └── _images -│ │ └── release-notes -│ ├── curate-text -│ │ ├── load-data -│ │ │ ├── common-crawl.md * -│ │ │ ├── custom.md * -│ │ │ ├── index.md * -│ │ │ └── read-existing.md * -│ │ ├── process-data -│ │ │ ├── content-processing -│ │ │ ├── deduplication -│ │ │ ├── language-management -│ │ │ ├── quality-assessment -│ │ │ └── specialized-processing -│ │ ├── synthetic -│ │ │ └── nemotron-cc -│ │ └── tutorials -│ ├── _extensions -│ │ ├── ai_assistant -│ │ │ ├── assets -│ │ │ │ └── styles -│ │ │ ├── core -│ │ │ ├── integrations -│ │ │ └── ui -│ │ ├── content_gating -│ │ ├── json_output -│ │ │ ├── content -│ │ │ ├── core -│ │ │ └── processing -│ │ ├── rich_metadata -│ │ │ └── templates -│ │ └── search_assets -│ │ ├── modules -│ │ └── templates -│ ├── _images -│ ├── _templates -│ ├── admin -│ │ ├── deployment -│ │ │ └── slurm -│ │ └── integrations -│ ├── curate-audio -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── asr-inference -│ │ │ ├── audio-analysis -│ │ │ ├── quality-assessment -│ │ │ └── text-integration -│ │ └── tutorials -│ ├── curate-images -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── embeddings -│ │ │ └── filters -│ │ └── tutorials -│ ├── curate-video -│ │ ├── load-data -│ │ ├── process-data -│ │ └── tutorials -│ │ ├── _images -│ │ └── pipeline-customization -│ ├── get-started -│ └── reference -│ └── infrastructure -├── nemo_curator -│ ├── stages 
-│ │ ├── text -│ │ │ ├── download -│ │ │ │ ├── base -│ │ │ │ │ ├── download.py * + -│ │ │ │ │ ├── extract.py * + -│ │ │ │ │ └── stage.py * + -│ │ │ │ ├── common_crawl -│ │ │ │ │ └── stage.py * + -│ │ │ │ ├── arxiv -│ │ │ │ ├── html_extractors -│ │ │ │ │ └── utils -│ │ │ │ └── wikipedia -│ │ │ ├── classifiers -│ │ │ ├── deduplication -│ │ │ ├── embedders -│ │ │ ├── filters -│ │ │ │ ├── fasttext -│ │ │ │ ├── heuristic -│ │ │ │ │ ├── code -│ │ │ │ │ └── repetition -│ │ │ │ ├── histogram -│ │ │ │ └── token -│ │ │ ├── io -│ │ │ │ ├── reader -│ │ │ │ └── writer -│ │ │ ├── models -│ │ │ ├── modifiers -│ │ │ │ ├── fasttext -│ │ │ │ ├── string -│ │ │ │ └── unicode -│ │ │ ├── modules -│ │ │ └── utils -│ │ ├── audio -│ │ │ ├── advanced_pipelines -│ │ │ │ └── audio_data_filter -│ │ │ ├── alm -│ │ │ ├── datasets -│ │ │ │ ├── fleurs -│ │ │ │ └── readspeech -│ │ │ ├── filtering -│ │ │ │ ├── band_filter_module -│ │ │ │ └── sigmos_filter_module -│ │ │ │ └── third_party -│ │ │ │ └── sigmos -│ │ │ ├── inference -│ │ │ ├── io -│ │ │ ├── metrics -│ │ │ ├── postprocessing -│ │ │ ├── preprocessing -│ │ │ └── segmentation -│ │ │ └── speaker_separation_module -│ │ ├── deduplication -│ │ │ ├── exact -│ │ │ ├── fuzzy -│ │ │ │ └── lsh -│ │ │ ├── semantic -│ │ │ └── shuffle_utils -│ │ ├── image -│ │ │ ├── deduplication -│ │ │ ├── embedders -│ │ │ ├── filters -│ │ │ └── io -│ │ ├── interleaved -│ │ │ ├── filter -│ │ │ ├── io -│ │ │ │ ├── readers -│ │ │ │ └── writers -│ │ │ ├── pdf -│ │ │ │ └── nemotron_parse -│ │ │ └── utils -│ │ ├── math -│ │ │ ├── classifiers -│ │ │ ├── download -│ │ │ │ └── html_extractors -│ │ │ └── modifiers -│ │ ├── synthetic -│ │ │ ├── nemo_data_designer -│ │ │ └── nemotron_cc -│ │ │ └── nemo_data_designer -│ │ └── video -│ │ ├── caption -│ │ ├── clipping -│ │ ├── embedding -│ │ ├── filtering -│ │ ├── io -│ │ └── preview -│ ├── backends -│ │ ├── internal -│ │ │ └── raft -│ │ ├── ray_actor_pool -│ │ ├── ray_data -│ │ └── xenna -│ ├── config -│ │ └── text -│ ├── core -│ 
├── metrics -│ ├── models -│ │ └── client -│ ├── pipeline -│ ├── tasks -│ └── utils -├── tutorials -│ ├── text -│ │ ├── download-and-extract -│ │ │ └── README.md * -│ │ ├── llama-nemotron-data-curation -│ │ │ ├── filters -│ │ │ ├── utils -│ │ │ ├── README.md * -│ │ │ └── main.py * + -│ │ ├── deduplication -│ │ │ ├── fuzzy -│ │ │ └── semantic -│ │ ├── distributed-data-classification -│ │ ├── gliner-pii-redaction -│ │ ├── megatron-tokenizer -│ │ ├── peft-curation -│ │ └── tinystories -│ ├── audio -│ │ ├── alm -│ │ ├── callhome_diar -│ │ ├── fleurs -│ │ ├── readspeech -│ │ └── single_speaker_filter -│ ├── image -│ │ └── getting-started -│ ├── interleaved -│ │ └── nemotron_parse_pdf -│ ├── math -│ ├── multimodal -│ ├── slurm -│ ├── synthetic -│ │ ├── nemo_data_designer -│ │ └── nemotron_cc -│ │ ├── example_data -│ │ └── nemo_data_designer -│ └── video -│ └── getting-started -├── .cursor -│ └── rules -├── .github -│ ├── actions -│ │ ├── build-container -│ │ └── test-template -│ ├── scripts -│ └── workflows -│ └── config -├── benchmarking -│ ├── data_prep -│ ├── runner -│ │ └── sinks -│ ├── scripts -│ └── tools -├── docker -│ └── common -├── fern -│ ├── assets -│ │ └── images -│ ├── components -│ └── versions -│ ├── v25.09 -│ │ └── pages -│ │ ├── about -│ │ │ ├── concepts -│ │ │ │ ├── audio -│ │ │ │ ├── image -│ │ │ │ ├── text -│ │ │ │ └── video -│ │ │ └── release-notes -│ │ ├── admin -│ │ │ ├── deployment -│ │ │ └── integrations -│ │ ├── api-reference -│ │ │ ├── executors -│ │ │ └── tasks -│ │ ├── curate-audio -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── asr-inference -│ │ │ │ ├── audio-analysis -│ │ │ │ ├── quality-assessment -│ │ │ │ └── text-integration -│ │ │ └── tutorials -│ │ ├── curate-images -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── embeddings -│ │ │ │ └── filters -│ │ │ └── tutorials -│ │ ├── curate-text -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── content-processing -│ │ │ │ ├── deduplication -│ │ │ │ ├── 
language-management -│ │ │ │ ├── quality-assessment -│ │ │ │ └── specialized-processing -│ │ │ └── tutorials -│ │ ├── curate-video -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ └── tutorials -│ │ │ └── pipeline-customization -│ │ ├── get-started -│ │ └── reference -│ │ └── infrastructure -│ └── v26.02 -│ └── pages -│ ├── _images -│ ├── about -│ │ ├── concepts -│ │ │ ├── audio -│ │ │ ├── image -│ │ │ ├── text -│ │ │ │ └── _images -│ │ │ └── video -│ │ │ └── _images -│ │ └── release-notes -│ ├── admin -│ │ ├── deployment -│ │ │ └── slurm -│ │ └── integrations -│ ├── api-reference -│ │ ├── executors -│ │ └── tasks -│ ├── curate-audio -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── asr-inference -│ │ │ ├── audio-analysis -│ │ │ ├── quality-assessment -│ │ │ └── text-integration -│ │ └── tutorials -│ ├── curate-images -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── embeddings -│ │ │ └── filters -│ │ └── tutorials -│ ├── curate-text -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── content-processing -│ │ │ ├── deduplication -│ │ │ ├── language-management -│ │ │ ├── quality-assessment -│ │ │ └── specialized-processing -│ │ ├── synthetic -│ │ │ └── nemotron-cc -│ │ └── tutorials -│ ├── curate-video -│ │ ├── load-data -│ │ ├── process-data -│ │ └── tutorials -│ │ ├── _images -│ │ └── pipeline-customization -│ ├── get-started -│ └── reference -│ └── infrastructure -└── tests - ├── backends - │ ├── ray_actor_pool - │ └── ray_data - ├── config - ├── core - ├── fixtures - │ └── audio - │ └── alm - │ └── nested_manifests - │ ├── subdir_a - │ └── subdir_b - ├── metrics - ├── models - │ └── client - ├── pipelines - ├── stages - │ ├── audio - │ │ ├── advanced_pipelines - │ │ ├── alm - │ │ ├── datasets - │ │ ├── filtering - │ │ ├── inference - │ │ ├── io - │ │ ├── metrics - │ │ ├── postprocessing - │ │ ├── preprocessing - │ │ └── segmentation - │ ├── common - │ ├── deduplication - │ │ ├── exact - │ │ ├── fuzzy - │ │ ├── semantic - │ │ └── shuffle_utils - │ ├── image 
- │ │ ├── dedup - │ │ ├── embedders - │ │ ├── filters - │ │ └── io - │ ├── interleaved - │ │ ├── filter - │ │ ├── pdf - │ │ │ └── nemotron_parse - │ │ └── utils - │ ├── math_stages - │ │ ├── classifiers - │ │ ├── download - │ │ └── modifiers - │ ├── synthetic - │ │ ├── nemo_data_designer - │ │ └── nemotron_cc - │ │ └── nemo_data_designer - │ ├── text - │ │ ├── classifiers - │ │ ├── deduplication - │ │ ├── download - │ │ │ ├── arxiv - │ │ │ ├── base - │ │ │ ├── common_crawl - │ │ │ └── wikipedia - │ │ ├── embedders - │ │ ├── io - │ │ │ ├── reader - │ │ │ └── writer - │ │ ├── models - │ │ └── modules - │ └── video - │ ├── caption - │ │ └── fixtures - │ ├── clipping - │ ├── embedding - │ ├── filtering - │ ├── io - │ └── preview - ├── tasks - └── utils - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; selected files shown. - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/base.py -Imports: - - import contextlib - - import copy - - import time - - from abc import ABC, ABCMeta, abstractmethod - - from inspect import isabstract - - from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, final - - from loguru import logger - - from nemo_curator.stages.resources import Resources - - from nemo_curator.tasks import Task - - from nemo_curator.backends.base import NodeInfo, WorkerMetadata ---- -Classes: - - StageMeta - Methods: - - L46: def __new__(mcls, name, bases, namespace, **kwargs): - - ProcessingStage - Methods: - - L92: def _name(self) -> str: - - L97: def _resources(self) -> Resources: - - L102: def _batch_size(self) -> int | None: - - L106: def __init_subclass__(cls, **kwargs): - - L127: def num_workers(self) -> int | None: - - L131: def validate_input(self, task: Task) -> bool: - - L161: def process(self, task: X) -> Y | list[Y]: - - L171: def process_batch(self, tasks: list[X]) -> list[Y]: - - L201: def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) 
-> None: - - L209: def setup(self, worker_metadata: WorkerMetadata | None = None) -> None: - - L217: def teardown(self) -> None: - - L222: def supports_batch_processing(self) -> bool: - - L230: def __repr__(self) -> str: - - L234: def inputs(self) -> tuple[list[str], list[str]]: - - L244: def outputs(self) -> tuple[list[str], list[str]]: - - L254: def xenna_stage_spec(self) -> dict[str, Any]: - - L262: def with_( - self, - name: str | None = None, - resources: Resources | None = None, - batch_size: int | None = None, - runtime_env: dict[str, Any] | None = None, - ) -> ProcessingStage: - - L293: def get_config(self) -> dict[str, Any]: - - L305: def ray_stage_spec(self) -> dict[str, Any]: - - L316: def _log_metrics(self, metrics: dict[str, float]) -> None: - - L327: def _log_metric(self, name: str, value: float) -> None: - - L331: def _time_metric(self, name: str) -> contextlib.AbstractContextManager[None]: - - L339: def _consume_custom_metrics(self) -> dict[str, float]: - Properties: - - _is_abstract_root - - name - - resources - - batch_size - - runtime_env - - CompositeStage - Methods: - - L359: def __init__(self): - - L362: def inputs(self) -> tuple[list[str], list[str]]: - - L366: def outputs(self) -> tuple[list[str], list[str]]: - - L371: def decompose(self) -> list[ProcessingStage]: - - L381: def with_(self, stage_with_dict: dict[str, Any]) -> CompositeStage: - - L387: def decompose_and_apply_with(self) -> list[ProcessingStage]: - - L391: def _apply_with_(self, stages: list[ProcessingStage]) -> list[ProcessingStage]: - - L419: def process(self, task: X) -> Y | list[Y]: - - L425: def get_description(self) -> str: - -Functions: - - L62: def get_stage_class(name: str) -> type[ProcessingStage]: - -Global vars: - - X - - Y - - _STAGE_REGISTRY ---- - - -File: /Users/mromeijn/src/Curator/nemo_curator/tasks/file_group.py -Imports: - - from dataclasses import dataclass, field - - from typing import Any - - from loguru import logger - - from .tasks import Task ---- 
-Classes: - - FileGroupTask - Methods: - - L33: def num_items(self) -> int: - - L37: def validate(self) -> bool: - Properties: - - reader_config - - data ---- - - - -File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-acquisition-concepts.md -```md ---- -description: "Core concepts for acquiring text data from remote sources including DocumentDownloader, DocumentIterator, and DocumentExtractor components" -categories: ["concepts-architecture"] -tags: ["data-acquisition", "remote-sources", "download", "extract", "distributed"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "concept" -modality: "text-only" ---- - -(about-concepts-text-data-acquisition)= - -# Data Acquisition Concepts - -This guide covers the core concepts for acquiring and processing text data from remote sources in NeMo Curator. Data acquisition focuses on downloading, extracting, and converting remote data sources into the `DocumentBatch` format for further processing. - -## Overview - -Data acquisition in NeMo Curator follows a three-stage architecture: - -1. **Generate URLs**: Discover and generate download URLs from minimal input -2. **Download**: Retrieve raw data files from remote sources -3. **Iterate** and **Extract**: Extract individual records from downloaded containers and convert raw content to clean, structured text - -This process transforms diverse remote data sources into a standardized `DocumentBatch` that can be used throughout the text curation pipeline. - -## Core Components - -The data acquisition framework consists of four abstract base classes that define the acquisition workflow: - -### URLGenerator - -Generates URLs for downloading from minimal input configuration. You need to override `generate_urls` which generates a bunch of URLs that user wants to download. 
- -**Example Implementation**: - -```python -from dataclasses import dataclass -from nemo_curator.stages.text.download import URLGenerator - -@dataclass -class CustomURLGenerator(URLGenerator): - def generate_urls(self): - # Custom URL generation logic - urls = [] - ... - return urls -``` - -### DocumentDownloader - -Connects to and downloads data from remote repositories. You must override `_get_output_filename` and `_download_to_path` which are called by an underlying function called `download` which tries to be idempotent. - -**Example Implementation**: - -```python -from nemo_curator.stages.text.download import DocumentDownloader - -class CustomDownloader(DocumentDownloader): - def __init__(self, download_dir: str): - super().__init__(download_dir=download_dir) - - def _get_output_filename(self, url: str) -> str: - # Custom logic to extract filename from URL - return url.split("/")[-1] - - def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]: - # Custom download logic - # Return (success_bool, error_message) - try: - # ... download implementation ... - return True, None - except Exception as e: - return False, str(e) -``` - -### DocumentIterator - -Extracts individual records from downloaded containers. You should only override `iterate` and `output_columns` where `iterate` must have logic to load the local file path and return bunch of documents. The `list[dict]` is finally considered to a Pandas DataFrame which is passed to Extractor. 
- -**Example Implementation**: - -```python -from collections.abc import Iterator -from typing import Any -from nemo_curator.stages.text.download import DocumentIterator - -class CustomIterator(DocumentIterator): - def __init__(self, log_frequency: int = 1000): - super().__init__() - self._log_frequency = log_frequency - - def iterate(self, file_path: str) -> Iterator[dict[str, Any]]: - # Custom iteration logic to load local file and return documents - for record in load_local_file_fn(file_path): - yield {"content": record_content, "metadata": record_metadata} - - def output_columns(self) -> list[str]: - return ["content", "metadata"] -``` - -### DocumentExtractor (Optional) - -DocumentExtractor works on a Pandas DataFrame and is optional. - -**Example Implementation**: - -```python -from typing import Any -from nemo_curator.stages.text.download import DocumentExtractor - -class CustomExtractor(DocumentExtractor): - def __init__(self): - super().__init__() - - def extract(self, record: dict[str, str]) -> dict[str, Any] | None: - # Custom extraction logic - cleaned_text = clean_content(record["content"]) - detected_lang = detect_language(cleaned_text) - return {"text": cleaned_text, "language": detected_lang} - - def input_columns(self): - return ["content", "metadata"] - - def output_columns(self): - return ["text", "language"] -``` - -## Supported Data Sources - -NeMo Curator provides built-in support for major public text datasets: - -::::{grid} 2 2 2 3 -:gutter: 2 - -:::{grid-item-card} {octicon}`globe;1.5em;sd-mr-1` Common Crawl -:link: text-load-data-common-crawl -:link-type: ref - -Download and extract web archive data from Common Crawl -+++ -{bdg-secondary}`web-scale` {bdg-secondary}`multilingual` -::: - -:::{grid-item-card} {octicon}`typography;1.5em;sd-mr-1` ArXiv -:link: text-load-data-arxiv -:link-type: ref - -Download and extract scientific papers from arXiv -+++ -{bdg-secondary}`academic` {bdg-secondary}`scientific` -::: - -:::{grid-item-card} 
{octicon}`book;1.5em;sd-mr-1` Wikipedia -:link: text-load-data-wikipedia -:link-type: ref - -Download and extract Wikipedia articles from Wikipedia dumps -+++ -{bdg-secondary}`encyclopedic` {bdg-secondary}`structured` -::: - -:::{grid-item-card} {octicon}`gear;1.5em;sd-mr-1` Custom Data Sources -:link: text-load-data-custom -:link-type: ref - -Implement a download and extract pipeline for a custom data source -+++ -{bdg-secondary}`extensible` {bdg-secondary}`specialized` -::: - -:::: - -## Integration with Pipeline Architecture - -The data acquisition process seamlessly integrates with NeMo Curator's pipeline-based architecture. The `DocumentDownloadExtractStage` handles parallel processing through the distributed computing framework. - -### Acquisition Workflow - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.download import DocumentDownloadExtractStage -from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter -from nemo_curator.stages.base import ProcessingStage - -# Create composite stage -class CustomDownloadExtractStage(DocumentDownloadExtractStage): - def __init__( - self, - download_dir: str = "./custom_downloads", - url_limit: int | None = None, - record_limit: int | None = None, - add_filename_column: bool | str = True, - ): - # Create the URL generator - self.url_generator = CustomURLGenerator() - - # Create the downloader - self.downloader = CustomDownloader(download_dir=download_dir) - - # Create the iterator - self.iterator = CustomIterator() - - # Create the extractor - self.extractor = CustomExtractor() - - # Initialize the parent composite stage - super().__init__( - url_generator=self.url_generator, - downloader=self.downloader, - iterator=self.iterator, - extractor=self.extractor, - url_limit=url_limit, - record_limit=record_limit, - add_filename_column=add_filename_column, - ) - self.name = "custom_pipeline" - - def decompose(self) -> 
list[ProcessingStage]: - """Decompose this composite stage into its constituent stages.""" - return self.stages - - def get_description(self) -> str: - """Get a description of this composite stage.""" - return "Custom pipeline" - -# Initialize Ray client -ray_client = RayClient() -ray_client.start() - -# Define acquisition pipeline -pipeline = Pipeline(name="data_acquisition") - -# Create download and extract stage with custom components -custom_download_extract_stage = CustomDownloadExtractStage(...) -pipeline.add_stage(custom_download_extract_stage) - -# Write the results -pipeline.add_stage(JsonlWriter(...)) - -# Execute acquisition pipeline -results = pipeline.run() - -# Stop Ray client -ray_client.stop() -``` - -## Performance Optimization - -### Parallel Processing - -Data acquisition leverages distributed computing frameworks for scalable processing: - -- **Parallel Downloads**: Each URL in the generated list downloads through separate workers -- **Concurrent Extraction**: Files process in parallel across workers -- **Memory Management**: Streaming processing for large files - -## Integration with Data Loading - -Data acquisition produces a standardized output that integrates seamlessly with Curator's {ref}`Data Loading Concepts `: - -```{note} -Data acquisition includes basic content-level deduplication during extraction (such as removing duplicate HTML content within individual web pages). This is separate from the main deduplication pipeline stages (exact, fuzzy, and semantic deduplication) that operate on the full dataset after acquisition. -``` - -```python -from nemo_curator.stages.text.io.writer import ParquetWriter - -# Create acquisition pipeline with all stages including writer -acquisition_pipeline = Pipeline(name="data_acquisition") -# ... add acquisition stages ... 
- -# Add writer to save results directly -writer = ParquetWriter(path="acquired_data/") -acquisition_pipeline.add_stage(writer) - -# Run pipeline to acquire and save data in one execution -results = acquisition_pipeline.run() - -# Later: Load using pipeline-based data loading -from nemo_curator.stages.text.io.reader import ParquetReader - -load_pipeline = Pipeline(name="load_acquired_data") -reader = ParquetReader(file_paths="acquired_data/") -load_pipeline.add_stage(reader) -``` - -This enables you to: - -- **Separate acquisition from processing** for better workflow management -- **Cache acquired data** to avoid re-downloading -- **Mix acquired and local data** in the same processing pipeline -- **Use standard loading patterns** regardless of data origin - -``` - -File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-loading-concepts.md -```md ---- -description: "Core concepts for loading and managing text datasets using pipeline-based readers and DocumentBatch tasks" -categories: ["concepts-architecture"] -tags: ["data-loading", "document-dataset", "parallel-dataset", "distributed", "gpu-accelerated", "local-files"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "concept" -modality: "text-only" ---- - -(about-concepts-text-data-loading)= - -# Data Loading Concepts - -This guide covers the core concepts for loading and managing text data from local files in NVIDIA NeMo Curator. - -## Pipeline-Based Data Loading - -NeMo Curator uses a **pipeline-based architecture** for handling large-scale text data processing. Data flows through processing stages that transform data, enabling distributed processing of local files. - -The system provides two primary readers for text data: - -- **JsonlReader** - For JSON Lines format files (most common). -- **ParquetReader** - For columnar Parquet files (better performance for large datasets with PyArrow optimization). 
- -Both readers support optimization through: - -- **Field selection** - Reading specified columns to reduce memory usage. -- **Partitioning control** - Using `blocksize` or `files_per_partition` to optimize `DocumentBatch` sizes during distributed processing. -- **Recommended block size** - Use ~128MB for optimal object store performance with smaller data chunks. - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader, ParquetReader - -# Initialize Ray client -ray_client = RayClient() -ray_client.start() - -# Basic usage with optimization -pipeline = Pipeline(name="data_processing") - -# Define file type (example) -file_type = "jsonl" # or "parquet" based on your data - -if file_type == "jsonl": - # JSONL reader with field selection and partitioning - jsonl_reader = JsonlReader( - file_paths="/path/to/jsonl_directory", - blocksize="128MB", # Recommended for object store optimization - fields=["text", "id"] # Column selection for efficiency - ) - pipeline.add_stage(jsonl_reader) -else: - # Parquet reader with performance optimization - parquet_reader = ParquetReader( - file_paths="/path/to/parquet_directory", - files_per_partition=4, # Alternative to blocksize - fields=["text", "metadata"] - ) - pipeline.add_stage(parquet_reader) - -# Execute pipeline -results = pipeline.run() - -# Stop Ray client -ray_client.stop() -``` - -## Optimization Strategies - -### Partitioning Control - -:::{note} -**Partitioning Strategy**: Specify either `files_per_partition` or `blocksize`. If `files_per_partition` is provided, `blocksize` is ignored. 
-::: - -```python -# Option 1: Size-based partitioning (recommended) -reader = JsonlReader( - file_paths="/path/to/data", - blocksize="128MB" # Optimal for object store performance -) - -# Option 2: File count-based partitioning -reader = JsonlReader( - file_paths="/path/to/data", - files_per_partition=16 # Match your cluster size -) -``` - -### Performance Recommendations - -- **Block size and files per partition**: Use ~128MB for optimal performance. Very large batches lead to memory overhead when passing data between stages through the object store, while very small batches induce overhead from processing many more tasks. We recommend ~128MB as a good balance. Try to avoid going below 32MB or above 1GiB partition sizes. -- **Field selection**: Specify the `fields` parameter to read only the required columns. -- **Engine choice**: ParquetReader defaults to PyArrow with `dtype_backend="pyarrow"` for optimal performance and memory efficiency. If you encounter compatibility issues with certain data types or schemas, you can override these defaults through `read_kwargs`: - ```python - # Remove PyArrow dtype backend if compatibility issues arise - reader = ParquetReader( - file_paths="data.parquet", - read_kwargs={"dtype_backend": "numpy_nullable"} # Falls back to Pandas default behavior - ) - ``` - -### Memory Tips - -:::{warning} -If you set the `blocksize` parameter to a size smaller than your input file size(s), Curator does not split the input files and instead attempts to read each file in full. To avoid out-of-memory issues, use the helper script described below. -::: - -If any of your individual JSONL or Parquet files are greater than 2 GiB, we recommend using the `nemo_curator/utils/split_large_files.py` helper script to split them into more manageable sizes and prevent out-of-memory issues. 
You can run it with: - -```bash -python nemo_curator/utils/split_large_files.py --input-path "/path/to/input/dir" --file-type "parquet" --output-path "/path/to/output/dir" --target-size-mb 128 -``` - -It supports splitting JSONL or Parquet files as specified by the `--file-type` argument. - -Another option is running file splitting within your existing script. For example, you can split large JSONL files with: - -```python -import ray -from nemo_curator.core.client import RayClient -from nemo_curator.utils.split_large_files import split_jsonl_file_by_size - -# Start Ray client as usual -ray_client = RayClient() -ray_client.start() - -input_files = [] # your list of input jsonl files - -ray.get( - [ - split_jsonl_file_by_size.remote( - input_file=f, - output_path="/path/to/output/dir", - target_size_mb=128, - ) - for f in input_files - ] -) - -# initialize your Curator pipeline with JsonlReader, etc. -``` - -Similarly for Parquet files: - -```python -import ray -from nemo_curator.core.client import RayClient -from nemo_curator.utils.split_large_files import split_parquet_file_by_size - -# Start Ray client as usual -ray_client = RayClient() -ray_client.start() - -input_files = [] # your list of input parquet files - -ray.get( - [ - split_parquet_file_by_size.remote( - input_file=f, - output_path="/path/to/output/dir", - target_size_mb=128, - ) - for f in input_files - ] -) - -# initialize your Curator pipeline with ParquetReader, etc. 
-``` - -## Data Export Options - -NeMo Curator provides flexible export options for processed datasets: - -```python -from nemo_curator.stages.text.io.writer import JsonlWriter, ParquetWriter - -# Add writers to pipeline after processing stages -pipeline.add_stage(JsonlWriter(path="output_directory/")) -# or -pipeline.add_stage(ParquetWriter(path="output_directory/")) -``` - -## Common Loading Patterns - -### Multi-Source Data - -```python -# Combine multiple directories with same reader type -reader = JsonlReader(file_paths=[ - "dataset_v1/", - "dataset_v2/", - "additional_data/" -]) -``` - -:::{note} -You cannot combine different reader types (`JsonlReader` + `ParquetReader`) in the same pipeline stage. For different file types, you would need to create a new `CustomReader` from the underlying `BaseReader` that can read based on different extensions provided. -::: - -## Remote Data Sources - -This page focuses on loading text data from **local files** using `JsonlReader` and `ParquetReader`. Both readers support remote storage locations (Amazon S3, Azure) when you provide remote file paths. - -For downloading and processing data from **remote sources** like ArXiv, Common Crawl, and Wikipedia, refer to the {ref}`Data Acquisition Concepts ` page which covers: - -- **URLGenerator, DocumentDownloader, DocumentIterator, DocumentExtractor** components. -- **Built-in support** for Common Crawl, ArXiv, Wikipedia, and custom sources. -- **Integration patterns** with pipeline-based processing. -- **Configuration and scaling** strategies. - -The data acquisition process produces standardized output that integrates seamlessly with the pipeline-based loading concepts described on this page. - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/index.md -```md ---- -description: "Load text data from Common Crawl, Wikipedia, and custom datasets using Curator." 
-categories: ["workflows"] -tags: ["data-loading", "arxiv", "common-crawl", "wikipedia", "custom-data", "distributed", "ray"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "workflow" -modality: "text-only" ---- - -(text-load-data)= - -# Download Data - -Load text data from ArXiv, Common Crawl, Wikipedia, and custom sources using Curator. - -Curator provides a task-centric pipeline for downloading and processing large-scale public text datasets. It runs on Ray and converts raw formats like Common Crawl's `.warc.gz` into JSONL. - -## How it Works - -Curator uses a {ref}`4-step pipeline pattern ` where data flows through stages as tasks. Each step uses a `ProcessingStage` that transforms tasks according to Curator's {ref}`pipeline-based architecture `. - -Data sources provide composite stages that combine these steps into complete download-and-extract pipelines, producing `DocumentBatch` tasks for further processing. - -::::{tab-set} - -:::{tab-item} Python - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.download import CommonCrawlDownloadExtractStage -from nemo_curator.stages.text.io.writer import JsonlWriter - -# Initialize Ray client -ray_client = RayClient() -ray_client.start() - -# Create a pipeline for downloading Common Crawl data -pipeline = Pipeline( - name="common_crawl_download", - description="Download and process Common Crawl web archives" -) - -# Add data loading stage -cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", - end_snapshot="2020-50", - download_dir="/tmp/cc_downloads", - crawl_type="main", - url_limit=10 # Limit for testing -) -pipeline.add_stage(cc_stage) - -# Add writer stage to save as JSONL -writer = JsonlWriter(path="/output/folder") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() - -# Stop Ray client -ray_client.stop() -``` - -::: - -:::: - ---- - -## 
Data Sources & File Formats - -Load data from public datasets and custom data sources using Curator stages. - -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`file;1.5em;sd-mr-1` Read Existing Data -:link: text-load-data-read-existing -:link-type: ref -Read existing JSONL and Parquet datasets using Curator's reader stages -+++ -{bdg-secondary}`jsonl` -{bdg-secondary}`parquet` -::: - -:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Common Crawl -:link: text-load-data-common-crawl -:link-type: ref -Download and extract web archive data from Common Crawl -+++ -{bdg-secondary}`web-data` -{bdg-secondary}`warc` -{bdg-secondary}`html-extraction` -::: - -:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Wikipedia -:link: text-load-data-wikipedia -:link-type: ref -Download and extract Wikipedia articles from Wikipedia dumps -+++ -{bdg-secondary}`articles` -{bdg-secondary}`multilingual` -{bdg-secondary}`xml-dumps` -::: - -:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Custom Data Sources -:link: text-load-data-custom -:link-type: ref -Implement a download and extract pipeline for a custom data source -+++ -{bdg-secondary}`jsonl` -{bdg-secondary}`parquet` -{bdg-secondary}`file-partitioning` -::: - -:::: - -```{toctree} -:maxdepth: 4 -:titlesonly: -:hidden: - -Read Existing Data -arxiv -common-crawl -wikipedia -Custom Data Sources -``` - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/common-crawl.md -```md ---- -description: "Download and extract text from Common Crawl web archives using Curator." -categories: ["how-to-guides"] -tags: ["common-crawl", "web-data", "warc", "language-detection", "distributed", "html-extraction", "pipeline"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "how-to" -modality: "text-only" ---- - -(text-load-data-common-crawl)= - -# Common Crawl - -Download and extract text from Common Crawl snapshots using Curator. 
- -Common Crawl provides petabytes of web data collected over years of web crawling. The data uses a compressed web archive format (`.warc.gz`), which requires processing to extract useful text for language model training. - -## How it Works - -Curator's Common Crawl processing pipeline consists of four sequential stages: - -1. **URL Generation**: Generates WARC file URLs from Common Crawl's index for the specified snapshot range -2. **Download**: Downloads the compressed WARC files from Common Crawl's servers (optionally using S3 for faster downloads) -3. **Iteration**: Extracts individual records from WARC files and decodes HTML content -4. **Extraction**: Performs language detection and extracts clean text using configurable HTML extraction algorithms - -The pipeline outputs structured data that you can write to JSONL or Parquet files for further processing. - -## Before You Start - -Choose your download method and ensure you have the prerequisites: - -- HTTPS downloads (default): No AWS account required. -- S3 downloads (set `use_aws_to_download=True`): - - An AWS account with credentials configured (profile, environment, or instance role). - - Common Crawl's S3 access uses Requester Pays; you incur charges for requests and data transfer. 
- - `s5cmd` installed for fast S3 listing and copy operations: - -```bash -# Install s5cmd for faster S3 downloads -pip install s5cmd -``` - ---- - -## Usage - -Here's how to create and run a Common Crawl processing pipeline: - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.download import CommonCrawlDownloadExtractStage -from nemo_curator.stages.text.io.writer import JsonlWriter - -def main(): - # Initialize Ray client - ray_client = RayClient() - ray_client.start() - - # Create pipeline - pipeline = Pipeline( - name="common_crawl_pipeline", - description="Download and process Common Crawl data" - ) - - # Add Common Crawl processing stage - cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", # YYYY-WW format for CC-MAIN - end_snapshot="2020-50", - download_dir="./cc_downloads", - crawl_type="main", # or "news" - use_aws_to_download=True, # Faster S3 downloads (requires s5cmd) - url_limit=10, # Limit number of WARC files for testing - record_limit=1000, # Limit records per WARC file - ) - pipeline.add_stage(cc_stage) - - # Add output writer stage - writer = JsonlWriter("./cc_output") - pipeline.add_stage(writer) - - # Run pipeline - results = pipeline.run() - - # Stop Ray client - ray_client.stop() - -if __name__ == "__main__": - main() -``` - -For executor options and configuration, refer to {ref}`reference-execution-backends`. 
- -### Writing to Parquet - -To write to Parquet files instead of JSONL, use `ParquetWriter`: - -```python -from nemo_curator.stages.text.io.writer import ParquetWriter - -# Replace the JSONL writer with ParquetWriter -writer = ParquetWriter("./cc_output_parquet") -pipeline.add_stage(writer) -``` - -### Parameters - -```{list-table} CommonCrawlDownloadExtractStage Parameters -:header-rows: 1 -:widths: 25 20 35 20 - -* - Parameter - - Type - - Description - - Default -* - `start_snapshot` - - str - - First snapshot to include (format: "YYYY-WW" for main, "YYYY-MM" for news). Not every year and week has a snapshot; refer to the official list at [https://data.commoncrawl.org/](https://data.commoncrawl.org/). - - Required -* - `end_snapshot` - - str - - Last snapshot to include (same format as `start_snapshot`). Ensure your range includes at least one valid snapshot. - - Required -* - `download_dir` - - str - - Directory to store downloaded WARC files - - Required -* - `crawl_type` - - Literal["main", "news"] - - Whether to use CC-MAIN or CC-NEWS dataset - - "main" -* - `html_extraction` - - HTMLExtractorAlgorithm | str | None - - Text extraction algorithm to use. Defaults to `JusTextExtractor()` if not specified. - - JusTextExtractor() if not specified -* - `html_extraction_kwargs` - - dict | None - - Additional arguments for the HTML extractor. Ignored when `html_extraction` is a concrete extractor object (for example, `JusTextExtractor()`); pass kwargs to the extractor constructor instead. When `html_extraction` is a string ("justext", "resiliparse", or "trafilatura"), kwargs are forwarded. - - None -* - `stop_lists` - - dict[str, frozenset[str]] | None - - Language-specific stop words for text quality assessment. If not provided, Curator uses jusText defaults with additional support for Thai, Chinese, and Japanese languages. 
- - None -* - `use_aws_to_download` - - bool - - Use S3 downloads via s5cmd instead of HTTPS (requires s5cmd installation) - - False -* - `verbose` - - bool - - Enable verbose logging for download operations - - False -* - `url_limit` - - int | None - - Maximum number of WARC files to download (useful for testing) - - None -* - `record_limit` - - int | None - - Maximum number of records to extract per WARC file - - None -* - `add_filename_column` - - bool | str - - Whether to add source filename column to output; if str, uses it as the column name (default name: "file_name") - - True -``` - -## Output Format - -The pipeline processes Common Crawl data through several stages, ultimately producing structured documents. The extracted text includes the following fields: - -```json -{ - "url": "http://example.com/page.html", - "warc_id": "a515a7b6-b6ec-4bed-998b-8be2f86f8eac", - "source_id": "CC-MAIN-20201123153826-20201123183826-00000.warc.gz", - "language": "ENGLISH", - "text": "Extracted web page content..." -} -``` - -```{list-table} Output Fields -:header-rows: 1 -:widths: 20 80 - -* - Field - - Description -* - `url` - - Original URL of the web page -* - `warc_id` - - Unique identifier for the WARC record -* - `source_id` - - Name of the source WARC file -* - `language` - - Detected language of the content (e.g., "ENGLISH", "SPANISH") -* - `text` - - Extracted and cleaned text content -``` - -If you enable `add_filename_column`, the output includes an extra field `file_name` (or your custom column name). 
- -## Customization Options - -### HTML Text Extraction Algorithms - -Curator supports several HTML text extraction algorithms: - -```{list-table} Available HTML Extractors -:header-rows: 1 -:widths: 30 70 - -* - Extractor - - Library -* - `JusTextExtractor` - - [jusText](https://github.com/miso-belica/jusText) -* - `ResiliparseExtractor` - - [Resiliparse](https://github.com/chatnoir-eu/chatnoir-resiliparse) -* - `TrafilaturaExtractor` - - [Trafilatura](https://trafilatura.readthedocs.io/) -``` - -#### Configuring HTML Extractors - -```python -from nemo_curator.stages.text.download.html_extractors import ResiliparseExtractor -from nemo_curator.stages.text.download.html_extractors import TrafilaturaExtractor - -# Use Resiliparse for extraction -cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", - end_snapshot="2020-50", - download_dir="./downloads", - html_extraction=ResiliparseExtractor( - required_stopword_density=0.25, - main_content=True - ) -) - -# Or use Trafilatura with custom parameters -cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", - end_snapshot="2020-50", - download_dir="./downloads", - html_extraction=TrafilaturaExtractor( - min_extracted_size=200, - max_repetitions=3 - ) -) -``` - -### Language Processing - -You can customize language detection and extraction by providing stop words for different languages: - -```python -# Define custom stop words for specific languages -stop_lists = { - "ENGLISH": frozenset(["the", "and", "is", "in", "for", "where", "when", "to", "at"]), - "SPANISH": frozenset(["el", "la", "de", "que", "y", "en", "un", "es", "se", "no"]) -} - -cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", - end_snapshot="2020-50", - download_dir="./downloads", - stop_lists=stop_lists -) -``` - -## Advanced Usage - -### Processing CC-NEWS Data - -For Common Crawl News data, use the `news` crawl type with month-based snapshots: - -```python -cc_stage = 
CommonCrawlDownloadExtractStage( - start_snapshot="2020-08", # YYYY-MM format for CC-NEWS - end_snapshot="2020-10", - download_dir="./news_downloads", - crawl_type="news" # Use CC-NEWS instead of CC-MAIN -) -``` - -See [https://data.commoncrawl.org/crawl-data/CC-NEWS/index.html](https://data.commoncrawl.org/crawl-data/CC-NEWS/index.html) for more information. - -### Large-Scale Processing - -For production workloads, consider these optimizations: - -```python -cc_stage = CommonCrawlDownloadExtractStage( - start_snapshot="2020-50", - end_snapshot="2020-50", - download_dir="/fast_storage/cc_downloads", - use_aws_to_download=True, # Faster S3 downloads - verbose=False, # Reduce logging overhead - # Remove limits for full processing - # url_limit=None, - # record_limit=None -) -``` - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/custom.md -```md ---- -description: "Create custom data loading pipelines using Curator." -categories: ["how-to-guides"] -tags: ["custom-data", "stages", "pipelines", "data-loading"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "advanced" -content_type: "how-to" -modality: "text-only" ---- - -(text-load-data-custom)= - -# Custom Data Loading - -Create custom data loading pipelines using Curator. This guide shows how to build modular stages that run on Curator's distributed processing. - -## How It Works - -Curator uses the same **3-step pipeline pattern** described in {ref}`Data Acquisition Concepts ` for custom data loading. Each step uses an abstract base class with corresponding processing stages that compose into pipelines. - ---- - -## Architecture Overview - -For detailed information about the core components and data flow, see {ref}`Data Acquisition Concepts ` and {ref}`Data Loading Concepts `. - ---- - -## Implementation Guide - -### 1. 
Create Directory Structure - -```text -your_data_source/ -├── __init__.py -├── stage.py # Main composite stage -├── url_generation.py # URL generation logic -├── download.py # Download implementation -├── iterator.py # File iteration logic -└── extract.py # Data extraction logic (optional) -``` - -### 2. Build Core Components - -#### URL Generator (`url_generation.py`) - -```python -from dataclasses import dataclass -from nemo_curator.stages.text.download import URLGenerator - -@dataclass -class CustomURLGenerator(URLGenerator): - def generate_urls(self) -> list[str]: - """Generate list of URLs to download.""" - # Your URL generation logic here - return [ - "https://example.com/dataset1.zip", - "https://example.com/dataset2.zip", - ] -``` - -#### Document Downloader (`download.py`) - -```python -from nemo_curator.stages.text.download import DocumentDownloader - -class CustomDownloader(DocumentDownloader): - def __init__(self, download_dir: str): - super().__init__(download_dir=download_dir) - - def _get_output_filename(self, url: str) -> str: - """Extract filename from URL.""" - return url.split("/")[-1] - - def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]: - """Download file from URL to local path.""" - # Custom download logic - # Return (success_bool, error_message) - try: - # ... download implementation ... 
- return True, None - except Exception as e: - return False, str(e) -``` - -#### Document Iterator (`iterator.py`) - -```python -import json -from collections.abc import Iterator -from typing import Any -from nemo_curator.stages.text.download import DocumentIterator - -class CustomIterator(DocumentIterator): - def __init__(self, log_frequency: int = 1000): - super().__init__() - self._log_frequency = log_frequency - - def iterate(self, file_path: str) -> Iterator[dict[str, Any]]: - """Iterate over records in a file.""" - # Custom iteration logic to load local file and return documents - for record in load_local_file_fn(file_path): - yield {"content": record_content, "metadata": record_metadata, "id": record_id} - - def output_columns(self) -> list[str]: - """Define output columns.""" - return ["content", "metadata", "id"] -``` - -#### Document Extractor (`extract.py`) - -```python -from typing import Any -from nemo_curator.stages.text.download import DocumentExtractor - -class CustomExtractor(DocumentExtractor): - def __init__(self): - super().__init__() - - def extract(self, record: dict[str, str]) -> dict[str, Any] | None: - """Transform raw record to final format.""" - # Skip invalid records - if not record.get("content"): - return None - - # Extract and clean text - cleaned_text = self._clean_text(record["content"]) - - # Generate unique ID if not present - doc_id = record.get("id", self._generate_id(cleaned_text)) - - return { - "text": cleaned_text, - "id": doc_id, - "source": record.get("metadata", {}).get("source", "unknown") - } - - def input_columns(self) -> list[str]: - return ["content", "metadata", "id"] - - def output_columns(self) -> list[str]: - return ["text", "id", "source"] - - def _clean_text(self, text: str) -> str: - """Clean and normalize text.""" - # Your text cleaning logic here - return text.strip() - - def _generate_id(self, text: str) -> str: - """Generate unique ID for text.""" - import hashlib - return 
hashlib.md5(text.encode()).hexdigest()[:16] -``` - -### 3. Create Composite Stage (`stage.py`) - -```python -from nemo_curator.stages.text.download import DocumentDownloadExtractStage -from nemo_curator.stages.base import ProcessingStage -from .url_generation import CustomURLGenerator -from .download import CustomDownloader -from .iterator import CustomIterator -from .extract import CustomExtractor - -class CustomDataStage(DocumentDownloadExtractStage): - """Custom data loading stage combining all components.""" - - def __init__( - self, - download_dir: str = "./custom_downloads", - url_limit: int | None = None, - record_limit: int | None = None, - add_filename_column: bool | str = True, - ): - self.url_generator = CustomURLGenerator() - self.downloader = CustomDownloader(download_dir=download_dir) - self.iterator = CustomIterator() - self.extractor = CustomExtractor() - - # Initialize the parent composite stage - super().__init__( - url_generator=self.url_generator, - downloader=self.downloader, - iterator=self.iterator, - extractor=self.extractor, # Optional - remove if not needed - url_limit=url_limit, - record_limit=record_limit, - add_filename_column=add_filename_column, - ) - self.name = "custom_data" - - def decompose(self) -> list[ProcessingStage]: - """Decompose this composite stage into its constituent stages.""" - return self.stages - - def get_description(self) -> str: - """Get a description of this composite stage.""" - return "Custom data" -``` - ---- - -## Usage Examples - -### Basic Pipeline - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from your_data_source.stage import CustomDataStage -from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter - -def main(): - # Initialize Ray client - ray_client = RayClient() - ray_client.start() - - # Create pipeline - pipeline = Pipeline( - name="custom_data_pipeline", - description="Load and process custom dataset" - ) - - # Create custom data 
loading stage - data_stage = CustomDataStage(...) - - pipeline.add_stage(data_stage) - - # Save the results to JSONL - pipeline.add_stage(JsonlWriter(...)) - - # Run pipeline - print("Starting pipeline...") - results = pipeline.run() - - # Stop Ray client - ray_client.stop() - -if __name__ == "__main__": - main() -``` - -For executor options and configuration, refer to {ref}`reference-execution-backends`. - ---- - -## Parameters Reference - -```{list-table} Custom Data Loading Parameters -:header-rows: 1 -:widths: 20 20 40 20 - -* - Parameter - - Type - - Description - - Default -* - `url_generator` - - URLGenerator - - Custom URL generation implementation - - Required -* - `downloader` - - DocumentDownloader - - Custom download implementation - - Required -* - `iterator` - - DocumentIterator - - Custom file iteration implementation - - Required -* - `extractor` - - DocumentExtractor | None - - Optional extraction/transformation step - - None -* - `url_limit` - - int | None - - Maximum number of URLs to process - - None -* - `record_limit` - - int | None - - Maximum records per file - - None -* - `add_filename_column` - - bool | str - - Add filename column to output; if str, uses it as the column name (default name: "file_name") - - True -``` - ---- - -## Output Format - -Processed data flows through the pipeline as `DocumentBatch` tasks containing Pandas DataFrames or PyArrow Tables: - -### Example Output Schema - -```python -{ - "text": "This is the processed document text", - "id": "unique-document-id", - "source": "example.com", - "file_name": "dataset1.jsonl" # If add_filename_column=True (default column name) -} -``` - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/read-existing.md -```md ---- -description: "Read existing JSONL and Parquet datasets using Curator's reader stages." 
-categories: ["how-to-guides"] -tags: ["jsonl", "parquet", "data-loading", "reader", "pipelines"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "beginner" -content_type: "how-to" -modality: "text-only" ---- - -(text-load-data-read-existing)= - -# Read Existing Data - -Use Curator's `JsonlReader` and `ParquetReader` to read existing datasets into a pipeline, then optionally add processing stages. - -::::{tab-set} - -:::{tab-item} JSONL Reader -:sync: jsonl - -## Example: Read JSONL and Filter - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.filters import ScoreFilter -from nemo_curator.stages.text.filters.heuristic import WordCountFilter - -# Initialize Ray client -ray_client = RayClient() -ray_client.start() - -# Create pipeline for processing existing JSONL files -pipeline = Pipeline(name="jsonl_data_processing") - -# Read JSONL files -reader = JsonlReader( - file_paths="/path/to/data", - files_per_partition=4, - fields=["text", "url"] # Only read specific columns -) -pipeline.add_stage(reader) - -# Add filtering stage -word_filter = ScoreFilter( - filter_obj=WordCountFilter(min_words=50, max_words=1000), - text_field="text" -) -pipeline.add_stage(word_filter) - -# Add more stages to pipeline... 
- -# Execute pipeline -results = pipeline.run() - -# Stop Ray client -ray_client.stop() -``` - -::: - -:::{tab-item} Parquet Reader -:sync: parquet - -## Example: Read Parquet and Filter - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import ParquetReader -from nemo_curator.stages.text.filters import ScoreFilter -from nemo_curator.stages.text.filters.heuristic import WordCountFilter - -# Initialize Ray client -ray_client = RayClient() -ray_client.start() - -# Create pipeline for processing existing Parquet files -pipeline = Pipeline(name="parquet_data_processing") - -# Read Parquet files with PyArrow engine -reader = ParquetReader( - file_paths="/path/to/data", - files_per_partition=4, - fields=["text", "metadata"] # Only read specific columns -) -pipeline.add_stage(reader) - -# Add filtering stage -word_filter = ScoreFilter( - filter_obj=WordCountFilter(min_words=50, max_words=1000), - text_field="text" -) -pipeline.add_stage(word_filter) - -# Add more stages to pipeline... - -# Execute pipeline -results = pipeline.run() - -# Stop Ray client -ray_client.stop() -``` - -::: - -:::: - -## Reader Configuration - -### Common Parameters - -Both `JsonlReader` and `ParquetReader` support these configuration options: - -```{list-table} -:header-rows: 1 -:widths: 20 20 40 20 - -* - Parameter - - Type - - Description - - Default -* - `file_paths` - - str | list[str] - - File paths or glob patterns to read - - Required -* - `files_per_partition` - - int | None - - Number of files per partition. Overrides `blocksize` if both are provided. - - None -* - `blocksize` - - int | str | None - - Target partition size (e.g., "128MB"). Ignored if `files_per_partition` is provided. 
- - None -* - `fields` - - list[str] | None - - Column names to read (column selection) - - None (all columns) -* - `read_kwargs` - - dict[str, Any] | None - - Extra arguments for the underlying reader - - None -``` - -### Parquet-Specific Features - -`ParquetReader` provides these optimizations: - -- **PyArrow Engine**: Uses `pyarrow` engine by default for better performance. -- **Storage Options**: Supports cloud storage through `storage_options` in `read_kwargs`. -- **Schema Handling**: Automatic schema inference and validation. -- **Columnar Efficiency**: Optimized for reading specific columns. - -### Performance Tips - -- Use the `fields` parameter to read only the required columns for better performance. -- Set `files_per_partition` based on your cluster size and memory constraints. -- Use the `blocksize` parameter for fine-grained control over partition sizes. - -### Memory Tips - -:::{warning} -If you set the `blocksize` parameter to a size smaller than your input file size(s), Curator does not split the input files and instead attempts to read each file in full. To avoid out-of-memory issues, use the helper script described below. -::: - -If any of your individual JSONL or Parquet files are greater than 2 GiB, we recommend using the `nemo_curator/utils/split_large_files.py` helper script to split them into more manageable sizes and prevent out-of-memory issues. You can run it with: - -```bash -python nemo_curator/utils/split_large_files.py --input-path "/path/to/input/dir" --file-type "parquet" --output-path "/path/to/output/dir" --target-size-mb 128 -``` - -It supports splitting JSONL or Parquet files as specified by the `--file-type` argument. - -Another option is running file splitting within your existing script. 
For example, you can split large JSONL files with: - -```python -import ray -from nemo_curator.core.client import RayClient -from nemo_curator.utils.split_large_files import split_jsonl_file_by_size - -# Start Ray client as usual -ray_client = RayClient() -ray_client.start() - -input_files = [] # your list of input jsonl files - -ray.get( - [ - split_jsonl_file_by_size.remote( - input_file=f, - output_path="/path/to/output/dir", - target_size_mb=128, - ) - for f in input_files - ] -) - -# initialize your Curator pipeline with JsonlReader, etc. -``` - -Similarly for Parquet files: - -```python -import ray -from nemo_curator.core.client import RayClient -from nemo_curator.utils.split_large_files import split_parquet_file_by_size - -# Start Ray client as usual -ray_client = RayClient() -ray_client.start() - -input_files = [] # your list of input parquet files - -ray.get( - [ - split_parquet_file_by_size.remote( - input_file=f, - output_path="/path/to/output/dir", - target_size_mb=128, - ) - for f in input_files - ] -) - -# initialize your Curator pipeline with ParquetReader, etc. -``` - -## Output Integration - -Both readers produce `DocumentBatch` tasks that integrate seamlessly with: - -- **Processing Stages**: Apply filters, transformations, and quality checks. -- **Writer Stages**: Export to JSONL, Parquet, or other formats. -- **Analysis Tools**: Convert to Pandas/PyArrow for inspection and debugging. - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/download.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import Any - -from loguru import logger - -from nemo_curator.stages.base import ProcessingStage -from nemo_curator.stages.resources import Resources -from nemo_curator.tasks import FileGroupTask - - -class DocumentDownloader(ABC): - """Abstract base class for document downloaders.""" - - def __init__(self, download_dir: str, verbose: bool = False): - """Initialize the downloader. - - Args: - download_dir: Directory to store downloaded files - verbose: If True, logs detailed download information - """ - self._download_dir = download_dir - self._verbose = verbose - os.makedirs(download_dir, exist_ok=True) - - @abstractmethod - def _get_output_filename(self, url: str) -> str: - """Generate output filename from URL. - - Args: - url: URL to download - - Returns: - Output filename (without directory path) - """ - ... - - @abstractmethod - def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]: - """Download URL to specified path. - - Args: - url: URL to download - path: Local path to save file - - Returns: - Tuple of (success, error_message). If success is True, error_message should be None. - If success is False, error_message should contain the error details. - """ - ... - - def download(self, url: str) -> str | None: - """Download a document from URL with temporary file handling. - - Downloads file to temporary location then atomically moves to final path. - Checks for existing file to avoid re-downloading. Supports resumable downloads. 
- Args: - url: URL to download - - Returns: - Path to downloaded file, or None if download failed - """ - # Generate output filename - output_name = self._get_output_filename(url) - output_file = os.path.join(self._download_dir, output_name) - temp_file = output_file + ".tmp" - - # If final file exists and is non-empty, assume it's complete - if os.path.exists(output_file) and os.path.getsize(output_file) > 0: - if self._verbose: - logger.info(f"File: {output_file} exists. Not downloading") - return output_file - - # Download to temporary file - success, error_message = self._download_to_path(url, temp_file) - - if success: - # Download successful, atomically move temp file to final location - os.rename(temp_file, output_file) - if self._verbose: - file_size = os.path.getsize(output_file) - logger.info(f"Successfully downloaded to {output_file} ({file_size} bytes)") - return output_file - else: - # Download failed - logger.error(f"Failed to download to {output_file}: {error_message}") - return None - - def num_workers_per_node(self) -> int | None: - """Number of workers per node for Downloading. This is sometimes needed to ensure we are not overloading the network. - - Returns: - Number of workers per node, or None if there is no limit and we can download as fast as possible - """ - return None - - -@dataclass -class DocumentDownloadStage(ProcessingStage[FileGroupTask, FileGroupTask]): - """Stage that downloads files from URLs to local storage. - - Takes a FileGroupTask with URLs and returns a FileGroupTask with local file paths. - This allows the download step to scale independently from iteration/extraction. 
- """ - - resources = Resources(cpus=0.5) - downloader: DocumentDownloader - batch_size = None - - def __post_init__(self): - self.name = f"download_{self.downloader.__class__.__name__.lower()}" - - def inputs(self) -> tuple[list[str], list[str]]: - """Define input requirements - expects FileGroupTask with URLs.""" - return (["data"], []) - - def outputs(self) -> tuple[list[str], list[str]]: - """Define output - produces FileGroupTask with local paths.""" - return (["data"], []) - - def process(self, task: FileGroupTask) -> FileGroupTask: - """Download URLs to local files. - - Args: - task (FileGroupTask): Task containing URLs to download - - Returns: - FileGroupTask: Task containing local file paths - """ - local_files = [] - - for url in task.data: - downloaded_file = self.downloader.download(url) - if downloaded_file: - local_files.append(downloaded_file) - - return FileGroupTask( - task_id=task.task_id, - dataset_name=task.dataset_name, - data=local_files, - _metadata={ - **task._metadata, - "source_files": local_files, # Add downloaded files for deterministic naming during write stage - }, - _stage_perf=task._stage_perf, - ) - - def xenna_stage_spec(self) -> dict[str, Any]: - return { - "num_workers_per_node": self.downloader.num_workers_per_node(), - } - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/extract.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from abc import ABC, abstractmethod -from typing import Any - - -class DocumentExtractor(ABC): - """Abstract base class for document extractors. - - Takes a record dict and returns processed record dict or None to skip. - Can transform any fields in the input dict. - """ - - @abstractmethod - def extract(self, record: dict[str, str]) -> dict[str, Any] | None: - """Extract/transform a record dict into final record dict.""" - ... - - @abstractmethod - def input_columns(self) -> list[str]: - """Define input columns - produces DocumentBatch with records.""" - ... - - @abstractmethod - def output_columns(self) -> list[str]: - """Define output columns - produces DocumentBatch with records.""" - ... - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/stage.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass - -from nemo_curator.stages.base import CompositeStage, ProcessingStage -from nemo_curator.tasks import DocumentBatch, _EmptyTask - -from .download import DocumentDownloader, DocumentDownloadStage -from .extract import DocumentExtractor -from .iterator import DocumentIterateExtractStage, DocumentIterator -from .url_generation import URLGenerationStage, URLGenerator - - -@dataclass -class DocumentDownloadExtractStage(CompositeStage[_EmptyTask, DocumentBatch]): - """Composite stage that combines URL generation, download, and iterate-extract stages. - - This supports the full 3-step pipeline pattern like Common Crawl: - 1. Generate URLs from minimal input - 2. Download files from URLs - 3. Iterate through files to extract structured content - - """ - - url_generator: URLGenerator - downloader: DocumentDownloader - iterator: DocumentIterator - extractor: DocumentExtractor | None = None - url_limit: int | None = None - record_limit: int | None = None - add_filename_column: bool | str = True - # Restart worker Process every N tasks to mitigate memory fragmentation - # Only used if executor is Ray Data - extractor_max_calls_per_worker: int | None = None - - def __post_init__(self): - """Initialize the constituent stages.""" - # URL generation stage - url_stage = URLGenerationStage( - url_generator=self.url_generator, - limit=self.url_limit, - ) - - # Download stage - download_stage = DocumentDownloadStage( - downloader=self.downloader, - ) - - # Iterate-extract stage - iterate_extract_stage = DocumentIterateExtractStage( - iterator=self.iterator, - extractor=self.extractor, - record_limit=self.record_limit, - add_filename_column=self.add_filename_column, - max_calls_per_worker=self.extractor_max_calls_per_worker, - ) - - stages = [url_stage, download_stage, iterate_extract_stage] - self.stages = stages - - url_generator_name = self.url_generator.__class__.__name__.lower() - downloader_name = self.downloader.__class__.__name__.lower() - 
self.name = f"document_download_extract_{url_generator_name}_{downloader_name}_composite" - super().__init__() - - def decompose(self) -> list[ProcessingStage]: - """Decompose into constituent stages.""" - return self.stages - - def get_description(self) -> str: - """Get description of this composite stage.""" - return f"URL-Download-Iterate-Extract pipeline using {self.url_generator.__class__.__name__} and {self.downloader.__class__.__name__}" - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/common_crawl/stage.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Literal - -from loguru import logger - -from nemo_curator.stages.base import ProcessingStage -from nemo_curator.stages.text.download import DocumentDownloadExtractStage -from nemo_curator.stages.text.download.html_extractors import HTMLExtractorAlgorithm -from nemo_curator.stages.text.download.html_extractors.justext import JusTextExtractor - -from .download import CommonCrawlWARCDownloader -from .extract import CommonCrawlHTMLExtractor -from .url_generation import MainCommonCrawlUrlGenerator, NewsCommonCrawlUrlGenerator -from .warc_iterator import CommonCrawlWarcIterator - - -class CommonCrawlDownloadExtractStage(DocumentDownloadExtractStage): - """Composite stage for downloading and processing Common Crawl data. - - This pipeline: - 1. 
Generates WARC URLs (either from main or news crawls) - 2. Downloads WARC files - 3. Extracts content from WARC files - 4. Extracts text from HTML content - """ - - def __init__( # noqa: PLR0913 - self, - start_snapshot: str, - end_snapshot: str, - download_dir: str, - crawl_type: Literal["main", "news"] = "main", - html_extraction: HTMLExtractorAlgorithm | str | None = None, - html_extraction_kwargs: dict | None = None, - stop_lists: dict[str, frozenset[str]] | None = None, - use_aws_to_download: bool = False, - verbose: bool = False, - url_limit: int | None = None, - record_limit: int | None = None, - add_filename_column: bool | str = True, - extractor_max_calls_per_worker: int | None = None, - ): - self.crawl_type = crawl_type - self.start_snapshot = start_snapshot - self.end_snapshot = end_snapshot - - if crawl_type == "main": - self.url_generator = MainCommonCrawlUrlGenerator( - start_snapshot_str=start_snapshot, end_snapshot_str=end_snapshot, limit=url_limit - ) - else: - self.url_generator = NewsCommonCrawlUrlGenerator( - start_snapshot_str=start_snapshot, end_snapshot_str=end_snapshot, limit=url_limit - ) - - self.downloader = CommonCrawlWARCDownloader( - download_dir=download_dir, use_aws_to_download=use_aws_to_download, verbose=verbose - ) - self.iterator = CommonCrawlWarcIterator() - self.extractor = CommonCrawlHTMLExtractor( - algorithm=html_extraction, - algorithm_kwargs=html_extraction_kwargs, - stop_lists=stop_lists, - ) - if extractor_max_calls_per_worker is None and isinstance(self.extractor.algorithm, JusTextExtractor): - extractor_max_calls_per_worker = 2 - logger.info( - "jusText extraction can cause memory fragmentation and lead to OOM errors. " - "Setting extractor_max_calls_per_worker=2 for the iterate-extract stage. " - "Pass extractor_max_calls_per_worker explicitly to override." 
- ) - super().__init__( - url_generator=self.url_generator, - downloader=self.downloader, - iterator=self.iterator, - extractor=self.extractor, - url_limit=url_limit, - record_limit=record_limit, - add_filename_column=add_filename_column, - extractor_max_calls_per_worker=extractor_max_calls_per_worker, - ) - self.name = f"common_crawl_{self.crawl_type}_pipeline" - - def decompose(self) -> list[ProcessingStage]: - """Decompose this composite stage into its constituent stages.""" - return self.stages - - def get_description(self) -> str: - """Get a description of this composite stage.""" - return f"Common Crawl {self.crawl_type} pipeline: {self.start_snapshot} to {self.end_snapshot}" - -``` - -File: /Users/mromeijn/src/Curator/tutorials/text/download-and-extract/README.md -```md -# Download and Extract Common Crawl, Wikipedia, and ArXiv Data - -This Jupyter notebook tutorial demonstrates how to use NeMo Curator to download text data from [Common Crawl](https://commoncrawl.org/), [Wikipedia](https://dumps.wikimedia.org/backup-index.html), and [ArXiv](https://info.arxiv.org/help/bulk_data_s3.html), respectively. - -For more information about downloading and extracting data with NeMo Curator, refer to the [Download Data](https://docs.nvidia.com/nemo/curator/latest/curate-text/load-data/index.html) and [Data Acquisition Concepts](https://docs.nvidia.com/nemo/curator/latest/about/concepts/text/data-acquisition-concepts.html) documentation pages. - -Please note that the ArXiv section of the tutorial requires the [s5cmd](https://github.com/peak/s5cmd) tool to be installed and configured with proper AWS credentials. 
- -``` - -File: /Users/mromeijn/src/Curator/tutorials/text/llama-nemotron-data-curation/README.md -```md -# Curate the Llama Nemotron Reasoning Dataset with NVIDIA NeMo Curator - -The [Llama Nemotron Post-Training Dataset](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset) is a curated collection of approximately 30 million high-quality synthetic samples designed to enhance the reasoning capabilities of large language models. -It is organized into distinct subsets for supervised fine-tuning (SFT) or reinforcement learning (RL) and encompasses samples from various problem domains. -All samples are in JSON lines (JSONL) format and contain metadata such as license type, source model, as well as the [Llama Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/llama-nemotron/) model(s) trained with that sample. - -Each sample consists of a prompt and an expected response. Samples either include detailed chain-of-thought (CoT) reasoning traces followed by a response ("reasoning on"), or contain a direct response without reasoning traces ("reasoning off"). -Here is an example of what a sample from the dataset may look like: - -```json -{ - "input": [ - {"role": "user", "content": "Can you explain the Pythagorean theorem?"} - ], - "output": "The user is asking for an explanation of the Pythagorean theorem. This is a fundamental principle in geometry related to right-angled triangles. 
I should mention the formula and what each variable represents.The Pythagorean theorem states that in a right triangle, the square of the hypotenuse equals the sum of the squares of the other two sides: a² + b² = c².", - "reasoning": "on", - "system_prompt": "detailed thinking on", - "category": "math", - "license": "apache_v2", - "generator": "llama-3.3-70b", - "used_in_training": ["Ultra"], - "version": "v1" -} -``` - -The relevant attributes for this tutorial are as follows: - -- `input`: the prompt(s) to the model in the multi-turn chat completions message format. It always contains a message with the role `user`, followed by zero or more turns. -- `output`: the expected response from the model (ground truth). -- `reasoning`: whether the sample is for reasoning "on" mode or not - - If the value is "on", then the output contains a detailed CoT trace encoded inside think HTML tags followed by the output. - - If the value is "off", then the output doesn't contain any reasoning traces and contains a direct response. -- `system_prompt`: the (suggested) system prompt to control the reasoning mode of the system. For Llama Nemotron training, the system prompt is always either "detailed thinking on" or "detailed thinking off". This field is tied to the value in the `reasoning` field. -- `used_in_training`: the list of Llama Nemotron models that used this sample for training. For instance, a value of `["Ultra", "Nano"]` indicates that this sample was used for training the Ultra and Nano models, but not Super. - -This tutorial demonstrates how a user can process a subset of the Llama Nemotron dataset using NeMo Curator. The output files are created in the `input/output` JSONL format, suitable for use with various training frameworks, including [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo). 
You can easily modify this pipeline as you see fit and adapt it to your domain- or business-specific needs, and the resulting dataset can be used to train a reasoning model with a modest computing budget. - -## Environment Setup - -Setup requirements: - -- Hardware: This tutorial can be run entirely on CPU workers -- Recommended environment: This tutorial was developed and tested with a Conda environment - -Refer to the NeMo Curator [documentation](https://docs.nvidia.com/nemo/curator/latest/) for instructions on how to download NeMo Curator through PyPI, source, or Docker. - -## Prerequisites - -### Download Input Dataset - -The input dataset can be downloaded from Hugging Face: https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset - -The following commands can be used to download the dataset: - -```bash -# If needed: apt-get update && apt-get install -y git-lfs -git lfs install -git clone https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset -``` - -Alternatively, the dataset can be downloaded using Python: - -```python -from huggingface_hub import snapshot_download - -snapshot_download( - repo_id="nvidia/Llama-Nemotron-Post-Training-Dataset", - repo_type="dataset", - local_dir="/path/to/save/data", - # allow_patterns=["SFT/chat/chat.jsonl", "SFT/math/math_v1.1.jsonl"], # Select specific files or directories (if desired) -) -``` - -Ensure that the dataset was downloaded correctly. You can verify with the following commands: - -```bash -$ ls /path/to/Llama-Nemotron-Post-Training-Dataset/SFT -chat code math safety science -$ du -sh /path/to/Llama-Nemotron-Post-Training-Dataset/SFT -122G /path/to/Llama-Nemotron-Post-Training-Dataset/SFT -``` - -The above example ensures that the full SFT dataset was downloaded and is ready to use for the tutorial. 
If you only selected a subset of the data to download, then you should check that it matches the files on the [Hugging Face page](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset). - -### Tokenizer Access Instructions - -The tokenizer used by this tutorial is called [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Using it requires requesting access: - -1. Visit the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model page on Hugging Face. -2. Click "Access request". -3. Fill out the form and wait for approval. -4. After approval, log in to your Hugging Face account using the Hugging Face CLI. In the terminal, run `huggingface-cli login`. - -### Download FastText Language Identification Model - -The FastText language identification model is used to identify and filter out non-English text from the dataset. It can be downloaded from the FastText language identification page: https://fasttext.cc/docs/en/language-identification.html - -Use the following command to download the FastText language identification model to your current working directory: - -```bash -wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz -P ./ -``` - -## Usage - -This tutorial can be run with: - -```bash -LOGURU_LEVEL="ERROR" python main.py \ - --input-dir "/path/to/Llama-Nemotron-Post-Training-Dataset/SFT" \ - --filename-filter "chat" "math_v1.1" \ - --jsonl-blocksize-mb 100 \ - --tokenizer "meta-llama/Llama-3.1-8B-Instruct" \ - --lang-id-model-path "/path/to/lid.176.ftz" \ - --max-token-count 16384 \ - --max-completion-token-count 8192 \ - --keep-columns "input" "output" \ - --output-dir "/path/to/curated-data" \ - --num-cpus 16 -``` - -Setting `LOGURU_LEVEL="ERROR"` minimizes log output. Remove it when debugging. If you encounter issues, see the **Debugging Out of Memory Errors** section for help (reducing `--num-cpus` is the most common fix). 
- -Set `--hf-token` as needed for the tokenizer. - -Since the entire input dataset is very large, we recommend curating a focused subset of the data that aligns closely with your domain-specific tasks. To help with this, we provide a way to filter files before reading. There are many ways to subset the Llama Nemotron dataset, but we recommend starting with the math and chat subsets because they contain strong examples of domain-agnostic reasoning. To filter files by name, pass `--filename-filter` followed by any number of strings, such as "chat" and "math_v1.1". When reading the input data directory, the list of files will be filtered to only include files with names containing at least one of the strings provided by `--filename-filter`. If `--filename-filter` is not specified, then all files within the directory (over 30 million rows) will be used. - -The above script applies basic filtering to the input dataset: - -- Only take samples used for Nemotron Nano training. -- Remove empty and malformed samples. -- Remove non-English samples. -- Remove samples with total length (system prompt, input, and output responses) longer than 16k tokens (with chat template applied using the tokenizer). -- Remove samples with output responses longer than 8k tokens (with chat template applied using the tokenizer). -- Only keep columns specified by the `--keep-columns` parameter. We recommend keeping the "input", "output", and "completion_token_count" columns (the "completion_token_count" column always needs to be kept, so that the samples can be sorted). - -After filtering, it sorts all samples by completion (output response) length, then interleaves thinking ON and thinking OFF samples for curriculum learning. Samples are sorted in increasing order of difficulty, using the completion token count as a measure of difficulty. By default, records are interleaved one at a time (alternating one thinking ON sample with one thinking OFF sample). 
Pass `--chunk-size` followed by an integer to interleave in larger groups (for example, 10 or 100 records at a time). Interleaving samples from the "reasoning on" and "reasoning off" buckets gradually introduces complexity. - -## System Requirements - -- **Memory**: This tutorial can be CPU-only but is memory-intensive. For smaller memory systems, use `--filename-filter` to select a subset of the data. -- **CPU allocation**: The `--num-cpus` parameter controls parallelism. Each CPU worker processes data in parallel, so more CPUs means more memory usage. Start with a conservative value and increase gradually. - -## Debugging Out-of-Memory Errors - -If you encounter out-of-memory (OOM) errors: - -1. **Reduce partition size**: Lower the blocksize to reduce per-partition memory. Set `--jsonl-blocksize-mb 50` (default is 100 MB). -2. **Reduce CPU count**: Lower `--num-cpus` to reduce parallel memory pressure rather than using all available cores. -3. **Subset the data**: Use `--filename-filter` to process only specific subsets relevant to your use case (such as `--filename-filter "chat"`). - -## Next Steps - -To see how to train a reasoning model with the resulting dataset, refer to this NeMo tutorial: [Train Your Own Reasoning Model in 48 Hours on a Single GPU](https://github.com/NVIDIA/NeMo/tree/main/tutorials/llm/reasoning). - -The NeMo tutorial expects the `/path/to/curated-data/training.jsonl` file generated by this tutorial as input. - -``` - -File: /Users/mromeijn/src/Curator/tutorials/text/llama-nemotron-data-curation/main.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os -import time - -import ray -from filters.heuristic_filters import ( - ContainsThinkOpenTagFilter, - EmptyThinkTagsFilter, - MissingThinkCloseTagFilter, - MissingThinkOpenTagFilter, - NanoFilter, - ThinkingOnFilter, - malformed_filter, -) -from filters.model_filters import ApplyChatTemplate, CompletionTokenCountFilter, NonEnglishFilter, TokenCountFilter -from utils.jsonl_utils import interleave_datasets - -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.filters import ScoreFilter -from nemo_curator.stages.text.io.reader.jsonl import JsonlReader -from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter -from nemo_curator.utils.file_utils import get_all_file_paths_under -from nemo_curator.utils.split_large_files import split_jsonl_file_by_size - - -def main(args: argparse.Namespace) -> None: # noqa: PLR0915 - try: - os.makedirs(args.output_dir, exist_ok=False) - except FileExistsError as e: - msg = f"Output directory already exists: {args.output_dir}. Please delete or rename it and try again." - raise FileExistsError(msg) from e - - # Initialize and start Ray client with the number of CPUs specified by the user - ray_client = RayClient(num_cpus=args.num_cpus) - ray_client.start() - - # Initialize pipelines - pipeline_thinking_on = Pipeline( - name="curriculum_learning_thinking_on", description="Prepare dataset for curriculum learning with thinking ON." 
- ) - pipeline_thinking_off = Pipeline( - name="curriculum_learning_thinking_off", - description="Prepare dataset for curriculum learning with thinking OFF.", - ) - - start_time = time.time() - - # Handle input path - input_files = list(get_all_file_paths_under(args.input_dir, recurse_subdirectories=True, keep_extensions="jsonl")) - if args.filename_filter: - # Filter out files that don't contain any of the provided substrings - input_files = [filename for filename in input_files if any(s in filename for s in args.filename_filter)] - - input_dir = os.path.join(args.output_dir, "input_data_shards") - os.makedirs(input_dir, exist_ok=False) - - # Split into smaller files for parallel processing - ray.get( - [ - split_jsonl_file_by_size.remote( - input_file=f, - output_path=input_dir, - target_size_mb=args.jsonl_blocksize_mb, - ) - for f in input_files - ] - ) - - # Read files for each pipeline - pipeline_thinking_on.add_stage(JsonlReader(file_paths=input_dir)) - pipeline_thinking_off.add_stage(JsonlReader(file_paths=input_dir)) - - # Split pipelines into thinking ON and OFF - pipeline_thinking_on.add_stage(ScoreFilter(ThinkingOnFilter(), text_field="reasoning")) - pipeline_thinking_off.add_stage(ScoreFilter(ThinkingOnFilter(), text_field="reasoning", invert=True)) - - # Filter out samples based on various criteria - filter_steps = [ - ScoreFilter( - NanoFilter(), - text_field="used_in_training", - ), - ScoreFilter( - EmptyThinkTagsFilter(), - text_field="output", - ), - malformed_filter, - ScoreFilter( - MissingThinkCloseTagFilter(), - text_field="output", - ), - ] - for filter_step in filter_steps: - pipeline_thinking_on.add_stage(filter_step) - pipeline_thinking_off.add_stage(filter_step) - - # Filter out samples in thinking OFF that contain think tags - pipeline_thinking_off.add_stage( - ScoreFilter( - ContainsThinkOpenTagFilter(), - text_field="output", - ) - ) - # Filter out samples in thinking ON that do not contain think tags - pipeline_thinking_on.add_stage( - 
ScoreFilter( - MissingThinkOpenTagFilter(), - text_field="output", - ) - ) - - # Filter out samples based on token count - tokenizer_steps = [ - NonEnglishFilter( - tokenizer_identifier=args.tokenizer, - hf_token=args.hf_token, - lang_id_model_path=args.lang_id_model_path, - input_field="input", - output_field="output", - system_prompt_field="system_prompt", - ), - TokenCountFilter( - tokenizer_identifier=args.tokenizer, - hf_token=args.hf_token, - max_token_count=args.max_token_count, - input_field="input", - output_field="output", - system_prompt_field="system_prompt", - ), - CompletionTokenCountFilter( - tokenizer_identifier=args.tokenizer, - hf_token=args.hf_token, - max_completion_token_count=args.max_completion_token_count, - output_field="output", - ), - ApplyChatTemplate( - tokenizer_identifier=args.tokenizer, - hf_token=args.hf_token, - input_field="input", - output_field="output", - system_prompt_field="system_prompt", - ), - ] - for tokenizer_step in tokenizer_steps: - pipeline_thinking_on.add_stage(tokenizer_step) - pipeline_thinking_off.add_stage(tokenizer_step) - - if args.keep_columns: - keep_columns = args.keep_columns - # Always keep the completion_token_count column, so that we can sort the samples - if "completion_token_count" not in keep_columns: - keep_columns.append("completion_token_count") - else: - keep_columns = ["input", "output", "completion_token_count"] - - # Save intermediate datasets - thinking_on_unsorted_path = os.path.join(args.output_dir, "thinking_on_unsorted") - thinking_off_unsorted_path = os.path.join(args.output_dir, "thinking_off_unsorted") - pipeline_thinking_on.add_stage(JsonlWriter(thinking_on_unsorted_path, fields=keep_columns)) - pipeline_thinking_off.add_stage(JsonlWriter(thinking_off_unsorted_path, fields=keep_columns)) - - # Run pipelines - _thinking_on_output = pipeline_thinking_on.run() - _thinking_off_output = pipeline_thinking_off.run() - - # Sort datasets - thinking_on_ds = 
ray.data.read_json(thinking_on_unsorted_path, lines=True) - thinking_on_ds = thinking_on_ds.sort("completion_token_count") - thinking_on_sorted_path = os.path.join(args.output_dir, "thinking_on_sorted") - thinking_on_ds.write_json(thinking_on_sorted_path, orient="records", lines=True) - - thinking_off_ds = ray.data.read_json(thinking_off_unsorted_path, lines=True) - thinking_off_ds = thinking_off_ds.sort("completion_token_count") - thinking_off_sorted_path = os.path.join(args.output_dir, "thinking_off_sorted") - thinking_off_ds.write_json(thinking_off_sorted_path, orient="records", lines=True) - - # Interleave datasets and combine into a single output file - interleave_datasets( - thinking_on_sorted_path, - thinking_off_sorted_path, - os.path.join(args.output_dir, "training.jsonl"), - chunk_size=args.chunk_size, - ) - - end_time = time.time() - print(f"Total time taken: {end_time - start_time} seconds") - - ray_client.stop() - - -def attach_args() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - "Prepare dataset for curriculum learning.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--num-cpus", - type=int, - default=16, - help="Number of CPUs to use.", - ) - - parser.add_argument( - "--input-dir", - type=str, - help="Path to the input directory containing JSONL files.", - required=True, - ) - parser.add_argument( - "--filename-filter", - nargs="+", - type=str, - help="If specified, only files with names containing one or more of the provided substrings will be processed.", - ) - parser.add_argument( - "--jsonl-blocksize-mb", - type=int, - default=100, - help="Blocksize (in MB) to use for splitting the JSONL files.", - ) - - parser.add_argument( - "--tokenizer", - type=str, - default="meta-llama/Llama-3.1-8B-Instruct", - help="Hugging Face tokenizer", - ) - parser.add_argument( - "--hf-token", - type=str, - help="Hugging Face token (if needed)", - ) - parser.add_argument( - "--lang-id-model-path", - 
type=str, - help="Path to the FastText model", - required=True, - ) - parser.add_argument( - "--max-token-count", - type=int, - default=16384, - help="Optional maximum token count. Rows exceeding this count will be filtered out.", - ) - parser.add_argument( - "--max-completion-token-count", - type=int, - default=8192, - help="Optional maximum completion token count. Rows exceeding this count will be filtered out.", - ) - - parser.add_argument( - "--keep-columns", - nargs="+", - type=str, - help="Columns to keep when the dataset is written to disk.", - ) - - parser.add_argument( - "--chunk-size", - type=int, - default=1, - help="Chunk size to use for interleaving the datasets.", - ) - - parser.add_argument( - "--output-dir", - type=str, - help="Path to the output directory.", - required=True, - ) - - return parser - - -if __name__ == "__main__": - main(attach_args().parse_args()) - -``` - diff --git a/skills/nemotron-customize/context/curator-processing-language-quality.txt b/skills/nemotron-customize/context/curator-processing-language-quality.txt deleted file mode 100644 index 87c033d87..000000000 --- a/skills/nemotron-customize/context/curator-processing-language-quality.txt +++ /dev/null @@ -1,3302 +0,0 @@ - -/Users/mromeijn/src/Curator -├── docs -│ ├── about -│ │ ├── concepts -│ │ │ ├── text -│ │ │ │ ├── _images -│ │ │ │ └── data-processing-concepts.md * -│ │ │ ├── audio -│ │ │ ├── image -│ │ │ └── video -│ │ │ └── _images -│ │ └── release-notes -│ ├── curate-text -│ │ ├── process-data -│ │ │ ├── language-management -│ │ │ │ └── language.md * -│ │ │ ├── quality-assessment -│ │ │ │ ├── classifier.md * -│ │ │ │ └── distributed-classifier.md * -│ │ │ ├── content-processing -│ │ │ ├── deduplication -│ │ │ ├── specialized-processing -│ │ │ └── index.md * -│ │ ├── load-data -│ │ ├── synthetic -│ │ │ └── nemotron-cc -│ │ └── tutorials -│ ├── _extensions -│ │ ├── ai_assistant -│ │ │ ├── assets -│ │ │ │ └── styles -│ │ │ ├── core -│ │ │ ├── integrations -│ │ │ └── ui -│ 
│ ├── content_gating -│ │ ├── json_output -│ │ │ ├── content -│ │ │ ├── core -│ │ │ └── processing -│ │ ├── rich_metadata -│ │ │ └── templates -│ │ └── search_assets -│ │ ├── modules -│ │ └── templates -│ ├── _images -│ ├── _templates -│ ├── admin -│ │ ├── deployment -│ │ │ └── slurm -│ │ └── integrations -│ ├── curate-audio -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── asr-inference -│ │ │ ├── audio-analysis -│ │ │ ├── quality-assessment -│ │ │ └── text-integration -│ │ └── tutorials -│ ├── curate-images -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── embeddings -│ │ │ └── filters -│ │ └── tutorials -│ ├── curate-video -│ │ ├── load-data -│ │ ├── process-data -│ │ └── tutorials -│ │ ├── _images -│ │ └── pipeline-customization -│ ├── get-started -│ └── reference -│ └── infrastructure -├── nemo_curator -│ ├── config -│ │ └── text -│ │ └── heuristic_filter_english_pipeline.yaml * -│ ├── stages -│ │ ├── text -│ │ │ ├── classifiers -│ │ │ │ ├── base.py * + -│ │ │ │ ├── domain.py * + -│ │ │ │ └── quality.py * + -│ │ │ ├── filters -│ │ │ │ ├── fasttext -│ │ │ │ ├── heuristic -│ │ │ │ │ ├── code -│ │ │ │ │ └── repetition -│ │ │ │ ├── histogram -│ │ │ │ ├── token -│ │ │ │ ├── doc_filter.py * + -│ │ │ │ └── score_filter.py * + -│ │ │ ├── deduplication -│ │ │ ├── download -│ │ │ │ ├── arxiv -│ │ │ │ ├── base -│ │ │ │ ├── common_crawl -│ │ │ │ ├── html_extractors -│ │ │ │ │ └── utils -│ │ │ │ └── wikipedia -│ │ │ ├── embedders -│ │ │ ├── io -│ │ │ │ ├── reader -│ │ │ │ └── writer -│ │ │ ├── models -│ │ │ ├── modifiers -│ │ │ │ ├── fasttext -│ │ │ │ ├── string -│ │ │ │ └── unicode -│ │ │ ├── modules -│ │ │ └── utils -│ │ ├── audio -│ │ │ ├── advanced_pipelines -│ │ │ │ └── audio_data_filter -│ │ │ ├── alm -│ │ │ ├── datasets -│ │ │ │ ├── fleurs -│ │ │ │ └── readspeech -│ │ │ ├── filtering -│ │ │ │ ├── band_filter_module -│ │ │ │ └── sigmos_filter_module -│ │ │ │ └── third_party -│ │ │ │ └── sigmos -│ │ │ ├── inference -│ │ │ ├── io -│ │ │ ├── metrics -│ │ │ ├── 
postprocessing -│ │ │ ├── preprocessing -│ │ │ └── segmentation -│ │ │ └── speaker_separation_module -│ │ ├── deduplication -│ │ │ ├── exact -│ │ │ ├── fuzzy -│ │ │ │ └── lsh -│ │ │ ├── semantic -│ │ │ └── shuffle_utils -│ │ ├── image -│ │ │ ├── deduplication -│ │ │ ├── embedders -│ │ │ ├── filters -│ │ │ └── io -│ │ ├── interleaved -│ │ │ ├── filter -│ │ │ ├── io -│ │ │ │ ├── readers -│ │ │ │ └── writers -│ │ │ ├── pdf -│ │ │ │ └── nemotron_parse -│ │ │ └── utils -│ │ ├── math -│ │ │ ├── classifiers -│ │ │ ├── download -│ │ │ │ └── html_extractors -│ │ │ └── modifiers -│ │ ├── synthetic -│ │ │ ├── nemo_data_designer -│ │ │ └── nemotron_cc -│ │ │ └── nemo_data_designer -│ │ └── video -│ │ ├── caption -│ │ ├── clipping -│ │ ├── embedding -│ │ ├── filtering -│ │ ├── io -│ │ └── preview -│ ├── backends -│ │ ├── internal -│ │ │ └── raft -│ │ ├── ray_actor_pool -│ │ ├── ray_data -│ │ └── xenna -│ ├── core -│ ├── metrics -│ ├── models -│ │ └── client -│ ├── pipeline -│ ├── tasks -│ └── utils -├── tutorials -│ ├── text -│ │ ├── distributed-data-classification -│ │ │ └── README.md * -│ │ ├── deduplication -│ │ │ ├── fuzzy -│ │ │ └── semantic -│ │ ├── download-and-extract -│ │ ├── gliner-pii-redaction -│ │ ├── llama-nemotron-data-curation -│ │ │ ├── filters -│ │ │ └── utils -│ │ ├── megatron-tokenizer -│ │ ├── peft-curation -│ │ └── tinystories -│ ├── audio -│ │ ├── alm -│ │ ├── callhome_diar -│ │ ├── fleurs -│ │ ├── readspeech -│ │ └── single_speaker_filter -│ ├── image -│ │ └── getting-started -│ ├── interleaved -│ │ └── nemotron_parse_pdf -│ ├── math -│ ├── multimodal -│ ├── slurm -│ ├── synthetic -│ │ ├── nemo_data_designer -│ │ └── nemotron_cc -│ │ ├── example_data -│ │ └── nemo_data_designer -│ └── video -│ └── getting-started -├── .cursor -│ └── rules -├── .github -│ ├── actions -│ │ ├── build-container -│ │ └── test-template -│ ├── scripts -│ └── workflows -│ └── config -├── benchmarking -│ ├── data_prep -│ ├── runner -│ │ └── sinks -│ ├── scripts -│ └── tools -├── 
docker -│ └── common -├── fern -│ ├── assets -│ │ └── images -│ ├── components -│ └── versions -│ ├── v25.09 -│ │ └── pages -│ │ ├── about -│ │ │ ├── concepts -│ │ │ │ ├── audio -│ │ │ │ ├── image -│ │ │ │ ├── text -│ │ │ │ └── video -│ │ │ └── release-notes -│ │ ├── admin -│ │ │ ├── deployment -│ │ │ └── integrations -│ │ ├── api-reference -│ │ │ ├── executors -│ │ │ └── tasks -│ │ ├── curate-audio -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── asr-inference -│ │ │ │ ├── audio-analysis -│ │ │ │ ├── quality-assessment -│ │ │ │ └── text-integration -│ │ │ └── tutorials -│ │ ├── curate-images -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── embeddings -│ │ │ │ └── filters -│ │ │ └── tutorials -│ │ ├── curate-text -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ │ ├── content-processing -│ │ │ │ ├── deduplication -│ │ │ │ ├── language-management -│ │ │ │ ├── quality-assessment -│ │ │ │ └── specialized-processing -│ │ │ └── tutorials -│ │ ├── curate-video -│ │ │ ├── load-data -│ │ │ ├── process-data -│ │ │ └── tutorials -│ │ │ └── pipeline-customization -│ │ ├── get-started -│ │ └── reference -│ │ └── infrastructure -│ └── v26.02 -│ └── pages -│ ├── _images -│ ├── about -│ │ ├── concepts -│ │ │ ├── audio -│ │ │ ├── image -│ │ │ ├── text -│ │ │ │ └── _images -│ │ │ └── video -│ │ │ └── _images -│ │ └── release-notes -│ ├── admin -│ │ ├── deployment -│ │ │ └── slurm -│ │ └── integrations -│ ├── api-reference -│ │ ├── executors -│ │ └── tasks -│ ├── curate-audio -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── asr-inference -│ │ │ ├── audio-analysis -│ │ │ ├── quality-assessment -│ │ │ └── text-integration -│ │ └── tutorials -│ ├── curate-images -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── embeddings -│ │ │ └── filters -│ │ └── tutorials -│ ├── curate-text -│ │ ├── load-data -│ │ ├── process-data -│ │ │ ├── content-processing -│ │ │ ├── deduplication -│ │ │ ├── language-management -│ │ │ ├── quality-assessment -│ │ │ └── 
specialized-processing -│ │ ├── synthetic -│ │ │ └── nemotron-cc -│ │ └── tutorials -│ ├── curate-video -│ │ ├── load-data -│ │ ├── process-data -│ │ └── tutorials -│ │ ├── _images -│ │ └── pipeline-customization -│ ├── get-started -│ └── reference -│ └── infrastructure -└── tests - ├── backends - │ ├── ray_actor_pool - │ └── ray_data - ├── config - ├── core - ├── fixtures - │ └── audio - │ └── alm - │ └── nested_manifests - │ ├── subdir_a - │ └── subdir_b - ├── metrics - ├── models - │ └── client - ├── pipelines - ├── stages - │ ├── audio - │ │ ├── advanced_pipelines - │ │ ├── alm - │ │ ├── datasets - │ │ ├── filtering - │ │ ├── inference - │ │ ├── io - │ │ ├── metrics - │ │ ├── postprocessing - │ │ ├── preprocessing - │ │ └── segmentation - │ ├── common - │ ├── deduplication - │ │ ├── exact - │ │ ├── fuzzy - │ │ ├── semantic - │ │ └── shuffle_utils - │ ├── image - │ │ ├── dedup - │ │ ├── embedders - │ │ ├── filters - │ │ └── io - │ ├── interleaved - │ │ ├── filter - │ │ ├── pdf - │ │ │ └── nemotron_parse - │ │ └── utils - │ ├── math_stages - │ │ ├── classifiers - │ │ ├── download - │ │ └── modifiers - │ ├── synthetic - │ │ ├── nemo_data_designer - │ │ └── nemotron_cc - │ │ └── nemo_data_designer - │ ├── text - │ │ ├── classifiers - │ │ ├── deduplication - │ │ ├── download - │ │ │ ├── arxiv - │ │ │ ├── base - │ │ │ ├── common_crawl - │ │ │ └── wikipedia - │ │ ├── embedders - │ │ ├── io - │ │ │ ├── reader - │ │ │ └── writer - │ │ ├── models - │ │ └── modules - │ └── video - │ ├── caption - │ │ └── fixtures - │ ├── clipping - │ ├── embedding - │ ├── filtering - │ ├── io - │ └── preview - ├── tasks - └── utils - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; selected files shown. 
- -File: /Users/mromeijn/src/Curator/nemo_curator/stages/base.py -Imports: - - import contextlib - - import copy - - import time - - from abc import ABC, ABCMeta, abstractmethod - - from inspect import isabstract - - from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, final - - from loguru import logger - - from nemo_curator.stages.resources import Resources - - from nemo_curator.tasks import Task - - from nemo_curator.backends.base import NodeInfo, WorkerMetadata ---- -Classes: - - StageMeta - Methods: - - L46: def __new__(mcls, name, bases, namespace, **kwargs): - - ProcessingStage - Methods: - - L92: def _name(self) -> str: - - L97: def _resources(self) -> Resources: - - L102: def _batch_size(self) -> int | None: - - L106: def __init_subclass__(cls, **kwargs): - - L127: def num_workers(self) -> int | None: - - L131: def validate_input(self, task: Task) -> bool: - - L161: def process(self, task: X) -> Y | list[Y]: - - L171: def process_batch(self, tasks: list[X]) -> list[Y]: - - L201: def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None: - - L209: def setup(self, worker_metadata: WorkerMetadata | None = None) -> None: - - L217: def teardown(self) -> None: - - L222: def supports_batch_processing(self) -> bool: - - L230: def __repr__(self) -> str: - - L234: def inputs(self) -> tuple[list[str], list[str]]: - - L244: def outputs(self) -> tuple[list[str], list[str]]: - - L254: def xenna_stage_spec(self) -> dict[str, Any]: - - L262: def with_( - self, - name: str | None = None, - resources: Resources | None = None, - batch_size: int | None = None, - runtime_env: dict[str, Any] | None = None, - ) -> ProcessingStage: - - L293: def get_config(self) -> dict[str, Any]: - - L305: def ray_stage_spec(self) -> dict[str, Any]: - - L316: def _log_metrics(self, metrics: dict[str, float]) -> None: - - L327: def _log_metric(self, name: str, value: float) -> None: - - L331: def _time_metric(self, name: str) 
-> contextlib.AbstractContextManager[None]: - - L339: def _consume_custom_metrics(self) -> dict[str, float]: - Properties: - - _is_abstract_root - - name - - resources - - batch_size - - runtime_env - - CompositeStage - Methods: - - L359: def __init__(self): - - L362: def inputs(self) -> tuple[list[str], list[str]]: - - L366: def outputs(self) -> tuple[list[str], list[str]]: - - L371: def decompose(self) -> list[ProcessingStage]: - - L381: def with_(self, stage_with_dict: dict[str, Any]) -> CompositeStage: - - L387: def decompose_and_apply_with(self) -> list[ProcessingStage]: - - L391: def _apply_with_(self, stages: list[ProcessingStage]) -> list[ProcessingStage]: - - L419: def process(self, task: X) -> Y | list[Y]: - - L425: def get_description(self) -> str: - -Functions: - - L62: def get_stage_class(name: str) -> type[ProcessingStage]: - -Global vars: - - X - - Y - - _STAGE_REGISTRY ---- - - -File: /Users/mromeijn/src/Curator/nemo_curator/tasks/document.py -Imports: - - from dataclasses import dataclass, field - - import pandas as pd - - import pyarrow as pa - - from loguru import logger - - from .tasks import Task ---- -Classes: - - DocumentBatch - Methods: - - L34: def to_pyarrow(self) -> pa.Table: - - L44: def to_pandas(self) -> pd.DataFrame: - - L55: def num_items(self) -> int: - - L59: def get_columns(self) -> list[str]: - - L69: def validate(self) -> bool: - Properties: - - data ---- - - - -File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-processing-concepts.md -```md ---- -description: "Text processing workflows including quality filtering, fuzzy deduplication, content cleaning, and pipeline design" -categories: ["concepts-architecture"] -tags: ["data-processing", "quality-filtering", "deduplication", "pipeline", "distributed"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "concept" -modality: "text-only" ---- - -(about-concepts-text-data-processing)= -# Text Processing Concepts - -This guide 
covers the most common text processing workflows in NVIDIA NeMo Curator, based on real-world usage patterns from production data curation pipelines. - -## Most Common Workflows - -The majority of NeMo Curator users follow these core workflows, typically in this order: - -### 1. Quality Filtering - -Most users start with basic quality filtering using heuristic filters to remove low-quality content: - -**Essential Quality Filters:** - -- `WordCountFilter` - Remove too short/long documents -- `NonAlphaNumericFilter` - Remove symbol-heavy content -- `RepeatedLinesFilter` - Remove if content is too repetitive -- `PunctuationFilter` - Ensure proper sentence structure -- `BoilerPlateStringFilter` - Remove if content contains too much template/boilerplate text - -### 2. Content Cleaning and Modification - -Basic text normalization and cleaning operations: - -**Common Cleaning Steps:** - -- `UnicodeReformatter` - Normalize Unicode characters -- `NewlineNormalizer` - Standardize line breaks -- Basic HTML/markup removal - -### 3. Deduplication - -Remove duplicate and near-duplicate content. For comprehensive coverage of all deduplication approaches, refer to Curator's [Deduplication Concepts](about-concepts-deduplication). - -#### Exact Deduplication - -Remove identical documents, especially useful for smaller datasets: - -**Implementation:** MD5 or SHA-256 hashing for document identification - -#### Fuzzy Deduplication - -For production datasets, fuzzy deduplication is essential to remove near-duplicate content across sources: - -**Key Components:** - -- Ray distributed computing framework for scalability -- Connected components clustering for duplicate identification - -#### Semantic Deduplication - -Remove semantically similar content using embeddings for more sophisticated duplicate detection. 
- -## Core Processing Architecture - -NeMo Curator uses these fundamental building blocks that users combine into pipelines: - -```{list-table} -:header-rows: 1 - -* - Component - - Purpose - - Usage Pattern -* - **`Pipeline`** - - Orchestrate processing stages - - Add processing stages, typically starting with a read and completing with a write -* - **`ScoreFilter`** - - Apply filters with optional scoring - - Chain multiple quality filters -* - **`Modify`** - - Transform document content - - Clean and normalize text -* - **Reader/Writer Stages** - - Load and save text data - - Input/output for pipelines -* - **Processing Stages** - - Transform DocumentBatch tasks - - Core processing components -``` - -## Implementation Examples - -### Complete Quality Filtering Pipeline - -This is the most common starting workflow, used in 90% of production pipelines: - -:::{dropdown} Quality Filtering Pipeline Code Example -:icon: code-square - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.filters import ScoreFilter -from nemo_curator.stages.text.filters.heuristic.repetition import RepeatedLinesFilter -from nemo_curator.stages.text.filters.heuristic import ( - WordCountFilter, - NonAlphaNumericFilter, - PunctuationFilter, - BoilerPlateStringFilter -) - -# Start Ray client -ray_client = RayClient() -ray_client.start() - -# Create processing pipeline -pipeline = Pipeline(name="quality_filtering") - -# Load dataset - the starting point for all workflows -reader = JsonlReader(file_paths="input_data/") -pipeline.add_stage(reader) - -# Standard quality filtering pipeline (most common) -# Remove too short/long documents (essential) -# and save the word_count field -word_count_filter = ScoreFilter( - filter_obj=WordCountFilter(min_words=50, max_words=100000), - text_field="text", - 
score_field="word_count" -) -pipeline.add_stage(word_count_filter) - -# Remove symbol-heavy content -alpha_numeric_filter = ScoreFilter( - filter_obj=NonAlphaNumericFilter(max_non_alpha_numeric_to_text_ratio=0.25), - text_field="text" -) -pipeline.add_stage(alpha_numeric_filter) - -# Remove repetitive content -repeated_lines_filter = ScoreFilter( - filter_obj=RepeatedLinesFilter(max_repeated_line_fraction=0.7), - text_field="text" -) -pipeline.add_stage(repeated_lines_filter) - -# Ensure proper sentence structure -punctuation_filter = ScoreFilter( - filter_obj=PunctuationFilter(max_num_sentences_without_endmark_ratio=0.85), - text_field="text" -) -pipeline.add_stage(punctuation_filter) - -# Remove template/boilerplate text -boilerplate_filter = ScoreFilter( - filter_obj=BoilerPlateStringFilter(), - text_field="text" -) -pipeline.add_stage(boilerplate_filter) - -# Add writer stage -writer = JsonlWriter(path="filtered_data/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() - -# Cleanup Ray when done -ray_client.stop() -``` - -::: - -### Content Cleaning Pipeline - -Basic text normalization: - -:::{dropdown} Content Cleaning Pipeline Code Example -:icon: code-square - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.modifiers import Modify -from nemo_curator.stages.text.modifiers.unicode import UnicodeReformatter - -# Start Ray client -ray_client = RayClient() -ray_client.start() - -# Create cleaning pipeline -pipeline = Pipeline(name="content_cleaning") - -# Read input data -reader = JsonlReader(file_paths="input_data/") -pipeline.add_stage(reader) - -# Essential cleaning steps -# Normalize unicode characters (very common) -unicode_modifier = Modify( - modifier_fn=UnicodeReformatter(), - input_fields="text" -) 
-pipeline.add_stage(unicode_modifier) - -# Additional processing steps can be added as needed - -# Write cleaned data -writer = JsonlWriter(path="cleaned_data/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() - -# Cleanup Ray when done -ray_client.stop() -``` - -::: - -### Exact Deduplication Workflow - -Exact deduplication for any dataset size (requires Ray and at least 1 GPU): - -:::{dropdown} Exact Deduplication Code Example -:icon: code-square - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.stages.deduplication.exact.workflow import ExactDeduplicationWorkflow - -# Initialize Ray cluster with GPU support (required for exact deduplication) -ray_client = RayClient(num_gpus=4) -ray_client.start() - -# Configure exact deduplication workflow -exact_workflow = ExactDeduplicationWorkflow( - input_path="/path/to/input/data", - output_path="/path/to/output", - text_field="text", - perform_removal=False, # Currently only identification supported - assign_id=True, # Automatically assign unique IDs - input_filetype="parquet", -) - -# Run exact deduplication workflow -exact_workflow.run() - -# Cleanup Ray when done -ray_client.stop() -``` - -::: - -### Fuzzy Deduplication Workflow - -Critical for production datasets (requires Ray and at least 1 GPU): - -:::{dropdown} Fuzzy Deduplication Code Example -:icon: code-square - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.stages.deduplication.fuzzy.workflow import FuzzyDeduplicationWorkflow - -# Initialize Ray cluster with GPU support (required for fuzzy deduplication) -ray_client = RayClient(num_gpus=4) -ray_client.start() - -# Configure fuzzy deduplication workflow (production settings) -fuzzy_workflow = FuzzyDeduplicationWorkflow( - input_path="/path/to/input/data", - cache_path="/path/to/cache", - output_path="/path/to/output", - input_filetype="parquet", - input_blocksize="1.5GiB", - text_field="text", - perform_removal=False, # 
Currently only identification supported - # LSH parameters for ~80% similarity threshold - num_bands=20, # Number of LSH bands - minhashes_per_band=13, # Hashes per band - char_ngrams=24, # Character n-gram size - seed=42 -) - -# Run fuzzy deduplication workflow -fuzzy_workflow.run() - -# Cleanup Ray when done -ray_client.stop() -``` - -### Removing Identified Duplicates - -The identified duplicates can be removed using a separate workflow: - -:::{dropdown} Duplicate Removal Code Example -:icon: code-square - -```python -from nemo_curator.core.client import RayClient -from nemo_curator.stages.text.deduplication.removal_workflow import TextDuplicatesRemovalWorkflow - -# Start Ray client -ray_client = RayClient() -ray_client.start() - -# Configure workflow with input dataset and output duplicate IDs -removal_workflow = TextDuplicatesRemovalWorkflow( - input_path="/path/to/input/data", - ids_to_remove_path="/path/to/output/FuzzyDuplicateIds", - output_path="/path/to/deduplicated/output", - input_filetype="parquet", # Same as identification workflow - input_blocksize="1.5GiB", # Same as identification workflow - duplicate_id_field="_curator_dedup_id", - id_generator_path="/path/to/output/fuzzy_id_generator.json", -) - -# Run removal workflow -removal_workflow.run() - -# Cleanup Ray when done -ray_client.stop() -``` - -::: - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/index.md -```md ---- -description: "Process text data using comprehensive filtering, deduplication, content processing, and specialized tools for high-quality datasets" -categories: ["workflows"] -tags: ["data-processing", "filtering", "deduplication", "content-processing", "quality-assessment", "distributed"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "workflow" -modality: "text-only" ---- - -# Process Data for Text Curation - -Process text data you've loaded through NeMo Curator's {ref}`pipeline architecture `. 
- -NeMo Curator provides a comprehensive suite of tools for processing text data as part of the AI training pipeline. These tools help you analyze, transform, and filter your text datasets to ensure high-quality input for language model training. - -## How it Works - -NeMo Curator's text processing capabilities are organized into five main categories: - -1. **Language Management**: Handle multilingual content and language-specific processing -2. **Content Processing & Cleaning**: Clean, normalize, and transform text content -3. **Deduplication**: Remove duplicate and near-duplicate documents efficiently -4. **Quality Assessment & Filtering**: Score and remove low-quality content using heuristics and ML classifiers -5. **Specialized Processing**: Domain-specific processing for code and advanced curation tasks - -Each category provides specific implementations optimized for different curation needs. The result is a cleaned and filtered dataset ready for model training. - ---- - -## Language Management - -Handle multilingual content and language-specific processing requirements. - -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`globe;1.5em;sd-mr-1` Language Identification -:link: language-management/language -:link-type: doc -Identify document languages and separate multilingual datasets -+++ -{bdg-secondary}`fasttext` -{bdg-secondary}`176-languages` -{bdg-secondary}`detection` -::: - -:::{grid-item-card} {octicon}`filter;1.5em;sd-mr-1` Stop Words -:link: language-management/stopwords -:link-type: doc -Manage high-frequency words to enhance text extraction and content detection -+++ -{bdg-secondary}`preprocessing` -{bdg-secondary}`filtering` -{bdg-secondary}`language-specific` -::: - -:::: - -## Content Processing & Cleaning - -Clean, normalize, and transform text content for high-quality training data. 
- -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`typography;1.5em;sd-mr-1` Text Cleaning -:link: content-processing/text-cleaning -:link-type: doc -Fix Unicode issues, standardize spacing, and remove URLs -+++ -{bdg-secondary}`unicode` -{bdg-secondary}`normalization` -{bdg-secondary}`preprocessing` -::: - -:::: - -## Deduplication - -Remove duplicate and near-duplicate documents efficiently from your text datasets. All deduplication methods support both identification (finding duplicates) and removal (filtering them out) workflows. - -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`git-pull-request;1.5em;sd-mr-1` Exact Duplicate Removal -:link: deduplication/exact -:link-type: doc -Identify and remove character-for-character duplicates using MD5 hashing -+++ -{bdg-secondary}`hashing` -{bdg-secondary}`fast` -{bdg-secondary}`gpu-accelerated` -::: - -:::{grid-item-card} {octicon}`git-compare;1.5em;sd-mr-1` Fuzzy Duplicate Removal -:link: deduplication/fuzzy -:link-type: doc -Identify and remove near-duplicates using MinHash and LSH similarity -+++ -{bdg-secondary}`minhash` -{bdg-secondary}`lsh` -{bdg-secondary}`gpu-accelerated` -::: - -:::{grid-item-card} {octicon}`repo-clone;1.5em;sd-mr-1` Semantic Deduplication -:link: deduplication/semdedup -:link-type: doc -Identify and remove semantically similar documents using embeddings and clustering -+++ -{bdg-secondary}`embeddings` -{bdg-secondary}`meaning-based` -{bdg-secondary}`gpu-accelerated` -::: - -:::: - -## Quality Assessment & Filtering - -Score and remove low-quality content using heuristics and ML classifiers. 
- -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`filter;1.5em;sd-mr-1` Heuristic Filtering -:link: quality-assessment/heuristic -:link-type: doc -Filter text using configurable rules and metrics -+++ -{bdg-secondary}`rules` -{bdg-secondary}`metrics` -{bdg-secondary}`fast` -::: - -:::{grid-item-card} {octicon}`cpu;1.5em;sd-mr-1` Classifier Filtering -:link: quality-assessment/classifier -:link-type: doc -Filter text using trained quality classifiers -+++ -{bdg-secondary}`ml-models` -{bdg-secondary}`quality` -{bdg-secondary}`scoring` -::: - -:::{grid-item-card} {octicon}`cpu;1.5em;sd-mr-1` Distributed Classification -:link: quality-assessment/distributed-classifier -:link-type: doc -GPU-accelerated classification with pre-trained models -+++ -{bdg-secondary}`gpu` -{bdg-secondary}`distributed` -{bdg-secondary}`scalable` -::: - -:::: - -## Specialized Processing - -Domain-specific processing for code and advanced curation tasks. - -::::{grid} 1 1 1 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` Code Processing -:link: specialized-processing/code -:link-type: doc -Specialized filters for programming content and source code -+++ -{bdg-secondary}`programming` -{bdg-secondary}`syntax` -{bdg-secondary}`comments` -::: - -:::: - -```{toctree} -:maxdepth: 4 -:titlesonly: -:hidden: - -Language Management -Content Processing & Cleaning -Deduplication -Quality Assessment & Filtering -Specialized Processing -``` - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/language-management/language.md -```md ---- -description: "Identify document languages accurately using FastText models supporting 176 languages for multilingual text processing" -categories: ["how-to-guides"] -tags: ["language-identification", "fasttext", "multilingual", "176-languages", "detection", "classification"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "how-to" -modality: "text-only" ---- - 
-# Language Identification - -(text-process-data-languages-id)= - -Large unlabeled text corpora often contain a variety of languages. NVIDIA NeMo Curator provides tools to accurately identify the language of each document, which is essential for language-specific curation tasks and building high-quality monolingual datasets. - -## How it Works - -NeMo Curator's language identification system works through a three-step process: - -1. **Text Preprocessing**: For FastText classification, normalize input text by stripping whitespace and converting newlines to spaces. - -2. **FastText Language Detection**: The pre-trained FastText language identification model ([`lid.176.bin`](https://fasttext.cc/docs/en/language-identification.html)) analyzes the preprocessed text and returns: - - A confidence score (0.0 to 1.0) indicating certainty of the prediction - - A language code (for example, "EN", "ES", "FR") in FastText's two-letter uppercase format - -3. **Filtering and Scoring**: The pipeline filters documents based on a configurable confidence threshold (`min_langid_score`) and stores both the confidence score and language code as metadata. - -### Language Detection Process - -The `FastTextLangId` filter implements this workflow by: - -- Loading the FastText language identification model on worker initialization -- Processing text through `model.predict()` with `k=1` to get the top language prediction -- Extracting the language code from FastText labels (for example, `__label__en` becomes "EN") -- Comparing confidence scores against the threshold to determine document retention -- Returning results as `[confidence_score, language_code]` for downstream processing - -This approach supports **176 languages** with high accuracy, making it suitable for large-scale multilingual dataset curation where language-specific processing and monolingual dataset creation are critical. 
- -## Usage - -The following example demonstrates how to create a language identification pipeline using Curator with distributed processing. - -::::{tab-set} - -:::{tab-item} Python - -```python -"""Language identification using Curator.""" - -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.filters.fasttext import FastTextLangId -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.filters import ScoreFilter - -def create_language_identification_pipeline(data_dir: str) -> Pipeline: - """Create a pipeline for language identification.""" - - # Define pipeline - pipeline = Pipeline( - name="language_identification", - description="Identify document languages using FastText" - ) - - # Add stages - # 1. Reader stage - creates tasks from JSONL files - pipeline.add_stage( - JsonlReader( - file_paths=data_dir, - files_per_partition=2, # Each task processes 2 files - ) - ) - - # 2. Language identification with filtering - # IMPORTANT: Download lid.176.bin or lid.176.ftz from https://fasttext.cc/docs/en/language-identification.html - fasttext_model_path = "/path/to/lid.176.bin" # or lid.176.ftz (compressed) - pipeline.add_stage( - ScoreFilter( - FastTextLangId(model_path=fasttext_model_path, min_langid_score=0.3), - score_field="language" - ) - ) - - return pipeline - -def main(): - # Create pipeline - pipeline = create_language_identification_pipeline("./data") - - # Print pipeline description - print(pipeline.describe()) - - # Create executor and run - results = pipeline.run() - - # Process results - - total_documents = sum(task.num_items for task in results) if results else 0 - print(f"Total documents processed: {total_documents}") - - # Access language scores - for i, batch in enumerate(results): - if batch.num_items > 0: - df = batch.to_pandas() - print(f"Batch {i} columns: {list(df.columns)}") - # Language scores are now in the 'language' field - -if __name__ == "__main__": - main() -``` - -::: -:::: - -## 
Understanding Results - -The language identification process adds a score field to each document batch: - -1. **`language` field**: Contains the FastText language identification results as a string representation of a list with two elements (for backend compatibility): - - Element 0: The confidence score (between 0 and 1) - - Element 1: The language code in FastText format (for example, "EN" for English, "ES" for Spanish) - -2. **Task-based processing**: Curator processes documents in batches (tasks), and results are available through the task's Pandas DataFrame: - -```python -# Access results from pipeline execution -for batch in results: - df = batch.to_pandas() - # Language scores are in the 'language' column - print(df[['text', 'language']].head()) -``` - -:::{tip} -For quick exploratory inspection, converting a `DocumentBatch` to a Pandas DataFrame is fine. For performance and scalability, write transformations as `ProcessingStage`s (or with the `@processing_stage` decorator) and run them inside a `Pipeline` with an executor. Curator’s parallelism and resource scheduling apply when code runs as pipeline stages; ad‑hoc Pandas code executes on the driver and will not scale. -::: - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/quality-assessment/classifier.md -```md ---- -description: "Filter text using trained quality classifiers including FastText models and pre-trained language classification" -categories: ["how-to-guides"] -tags: ["classifier-filtering", "fasttext", "ml-models", "quality", "training", "scoring"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "how-to" -modality: "text-only" ---- - -(text-process-data-filter-classifier)= - -# Classifier-Based Filtering - -Classifier-based filtering uses machine learning models to differentiate between high-quality and low-quality documents. 
NVIDIA NeMo Curator implements an approach similar to the one described in [Brown et al., 2020](https://arxiv.org/abs/2005.14165), which trains a binary skip-gram classifier to distinguish between curated high-quality data and lower-quality data. - -## How It Works - -Classifier-based filtering learns the characteristics of high-quality documents from training data, unlike heuristic filtering which relies on predefined rules and thresholds. This approach is particularly effective when: - -- You have a reference dataset of known high-quality documents -- The distinction between high and low quality is complex or subtle -- You want to filter based on domain-specific characteristics - -NVIDIA NeMo Curator uses [fastText](https://fasttext.cc/) for implementing classifier-based filtering, which offers excellent performance and scalability for text classification tasks. - -:::{note} -fastText is the official name and capitalization used by the fastText library created by Facebook Research. -::: - -The classifier-based filtering process involves: - -1. Preparing training data by sampling from high-quality and low-quality datasets -2. Training a binary skip-gram classifier using fastText -3. Using the trained model to score documents in your dataset -4. Filtering documents based on the classifier scores, optionally using Pareto-based sampling - ---- - -## Usage - - -NeMo Curator provides two approaches for quality assessment: - -1. **Classification**: Use `QualityClassifier` to add quality predictions and optionally filter during classification -2. **Filtering**: Use `FastTextQualityFilter` with `ScoreFilter` for document-level filtering with Pareto sampling - -:::{note} -If you need to train custom fastText models for specific domains or requirements, refer to the [fastText documentation](https://fasttext.cc/docs/en/supervised-tutorial.html) for comprehensive training guides. 
-::: - -::::{tab-set} - -:::{tab-item} DeBERTa Quality Classification - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import QualityClassifier - -# Create pipeline with DeBERTa quality classifier -pipeline = Pipeline(name="deberta_quality_pipeline") - -# Add stages -read_stage = JsonlReader("input_data/") -classify_stage = QualityClassifier( - filter_by=["High"], # Keep only high-quality documents - model_inference_batch_size=256, - max_chars=6000 # Default value -) -write_stage = JsonlWriter("high_quality_output/") - -pipeline.add_stage(read_stage) -pipeline.add_stage(classify_stage) -pipeline.add_stage(write_stage) - -# Execute pipeline -results = pipeline.run() -``` - -::: - -:::{tab-item} FastText Quality Filter - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.filters import ScoreFilter -from nemo_curator.stages.text.filters.fasttext import FastTextQualityFilter - -# Create pipeline with FastText filter (requires pre-trained model) -pipeline = Pipeline(name="fasttext_quality_pipeline") - -# Add stages -read_stage = JsonlReader("input_data/") -filter_stage = ScoreFilter( - FastTextQualityFilter( - model_path="./quality_classifier.bin", # Path to your fastText model - label="__label__hq", # High quality label - alpha=3, # Pareto distribution alpha parameter - seed=42 # Random seed for reproducibility - ), - text_field="text", - score_field="quality_score" -) -write_stage = JsonlWriter("high_quality_output/") - -pipeline.add_stage(read_stage) -pipeline.add_stage(filter_stage) -pipeline.add_stage(write_stage) - -# Execute pipeline -results = pipeline.run() -``` - -::: - -:::{tab-item} Configuration - -You can configure quality classifiers 
and filters with different parameters: - -```python -from nemo_curator.stages.text.classifiers import QualityClassifier -from nemo_curator.stages.text.filters.fasttext import FastTextQualityFilter - -# DeBERTa quality classifier configurations -basic_deberta_classifier = QualityClassifier( - filter_by=["High"], # Keep only high-quality documents - model_inference_batch_size=256, - max_chars=6000 # Default value -) - -# More inclusive DeBERTa classifier -inclusive_deberta_classifier = QualityClassifier( - filter_by=["Medium", "High"], # Keep medium and high-quality documents - model_inference_batch_size=128, - max_chars=6000 -) - -# FastText quality filter configurations -basic_fasttext_filter = FastTextQualityFilter( - model_path="./quality_classifier.bin", - label="__label__hq", # High quality label - alpha=3, # Pareto distribution alpha parameter - seed=42 # Random seed for reproducibility -) - -# More selective FastText filter -selective_fasttext_filter = FastTextQualityFilter( - model_path="./quality_classifier.bin", - label="__label__hq", - alpha=5, # Higher alpha for stricter filtering - seed=42 -) -``` - -::: - -:::: - -## Quality Classifier and Filter Parameters - -### QualityClassifier (DeBERTa) - -The `QualityClassifier` accepts the following parameters: - -- `filter_by` (list, default=None): Quality levels to keep (options: "Low", "Medium", "High") -- `model_inference_batch_size` (int, default=256): Batch size for inference -- `max_chars` (int, default=6000): Max characters per document for processing -- `label_field` (str, default="quality_pred"): Name of the prediction column -- `text_field` (str, default="text"): Name of the text field in input data - -### FastTextQualityFilter - -The `FastTextQualityFilter` accepts the following parameters: - -- `model_path` (str, required): Path to the trained fastText model file -- `label` (str, default="__label__hq"): The label for high-quality documents -- `alpha` (float, default=3): Alpha parameter for Pareto 
distribution sampling -- `seed` (int, default=42): Random seed for reproducible sampling - -## Best Practices - -For effective classifier-based filtering: - -1. **Model selection**: Start with the DeBERTa quality classifier for general use cases; consider fastText for high-throughput scenarios -2. **Validation**: Manually review a sample of filtered results to confirm effectiveness -3. **Quality level tuning**: Adjust `filter_by` levels (DeBERTa) or `alpha` values (fastText) based on your quality requirements -4. **Batch size optimization**: Tune `model_inference_batch_size` for DeBERTa models based on your available memory -5. **Combination with heuristics**: Consider using heuristic filters as a pre-filter to improve efficiency -6. **Domain adaptation**: For specialized corpora, consider training custom models using domain-specific data - -``` - -File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/quality-assessment/distributed-classifier.md -```md ---- -description: "Perform distributed data classification using GPU-accelerated models for domain, quality, safety, and content assessment" -categories: ["how-to-guides"] -tags: ["distributed-classification", "gpu", "domain", "quality", "safety", "crossfit", "scalable"] -personas: ["data-scientist-focused", "mle-focused"] -difficulty: "intermediate" -content_type: "how-to" -modality: "text-only" ---- - -(text-process-data-filter-dist-classifier)= - -# Distributed Data Classification - -NVIDIA NeMo Curator provides a module for performing distributed classification on large text datasets using GPU acceleration. This enables the categorization and filtering of text documents based on multiple dimensions such as domain, quality, safety, educational value, content type, and more. These classifications can enhance the quality of training data for large language models by identifying high-value content and removing problematic material. 
- -## How It Works - -The distributed data classification in NeMo Curator works by: - -1. **Parallel Processing**: Chunking datasets across multiple computing nodes and GPUs to accelerate classification -2. **Pre-trained Models**: Using specialized models for different classification tasks -3. **Batched Inference**: Optimizing throughput with intelligent batching -4. **Consistent API**: Providing a unified interface through the `DistributedDataClassifier` base class - -The `DistributedDataClassifier` is designed to run on GPU clusters with minimal code changes regardless of which specific classifier you're using. All classifiers support filtering based on classification results and storing prediction scores as metadata. - -:::{note} -Distributed classification requires GPU acceleration and is not supported for CPU-only processing. As long as GPU resources are available and NeMo Curator is correctly installed, GPU acceleration is handled automatically. -::: - -```{tip} -**Running the tutorial notebooks**: The classification tutorial notebooks require the `text_cuda12` or `all` installation extra to include all relevant dependencies. If you encounter `ModuleNotFoundError`, reinstall with the appropriate extra: - - uv pip install "nemo-curator[text_cuda12]" - -When using classifiers that download from Hugging Face (such as Aegis and InstructionDataGuard), set your `HF_TOKEN` environment variable to avoid rate limiting: - - export HF_TOKEN="your_token_here" -``` - ---- - -## Usage - -NVIDIA NeMo Curator provides a base class `DistributedDataClassifier` that can be extended to fit your specific model. The only requirement is that the model can fit on a single GPU. This module operates on the GPU and works within the pipeline framework using DocumentBatch processing. 
- -### Classifier Comparison - -| Classifier | Purpose | Model Location | Key Parameters | Requirements | -|---|---|---|---|---| -| DomainClassifier | Assigns one of 26 domain labels (such as "Sports," "Science," "News") to English text | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) | `filter_by`, `text_field` | None | -| MultilingualDomainClassifier | Assigns domain labels to text in 52 languages; same labels as DomainClassifier | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) | `filter_by`, `text_field` | None | -| QualityClassifier | Rates document quality as "Low," "Medium," or "High" using a DeBERTa model | [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) | `filter_by`, `text_field` | None | -| AegisClassifier | Detects unsafe content across 13 risk categories (violence, hate speech, and others) using LlamaGuard | [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) | `aegis_variant`, `filter_by` | HuggingFace token | -| InstructionDataGuardClassifier | Identifies LLM poisoning attacks in instruction-response pairs | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) | `text_field`, `label_field` | HuggingFace token | -| FineWebEduClassifier | Scores educational value from 0 to 5 (0=spam, 5=scholarly) for training data selection | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) | `label_field`, `int_score_field` | None | -| FineWebMixtralEduClassifier | Scores educational value from 0 to 5 using Mixtral 8x22B annotation data | [nvidia/nemocurator-fineweb-mixtral-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) | `label_field`, `int_score_field`, `model_inference_batch_size=1024` | None | -| FineWebNemotronEduClassifier | Scores
educational value from 0 to 5 using Nemotron-4-340B annotation data | [nvidia/nemocurator-fineweb-nemotron-4-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) | `label_field`, `int_score_field`, `model_inference_batch_size=1024` | None | -| ContentTypeClassifier | Categorizes text into 11 speech types (such as "Blogs," "News," "Academic") | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) | `filter_by`, `text_field` | None | -| PromptTaskComplexityClassifier | Labels prompts by task type (such as QA and summarization) and complexity dimensions | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) | `text_field` | None | - -### Domain Classifier - -The Domain Classifier categorizes English text documents into specific domains or subject areas. - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import DomainClassifier - -# Create pipeline -pipeline = Pipeline(name="domain_classification") - -# Load dataset -reader = JsonlReader( - file_paths="books_dataset/", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the classifier, filtering for specific domains -domain_classifier = DomainClassifier(filter_by=["Games", "Sports"]) -pipeline.add_stage(domain_classifier) - -# Save the results -writer = JsonlWriter(path="games_and_sports/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -### Multilingual Domain Classifier - -Functionally similar to the Domain Classifier, but supports 52 languages.
- -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import MultilingualDomainClassifier - -pipeline = Pipeline(name="multilingual_domain_classification") -pipeline.add_stage(JsonlReader(file_paths="multilingual_dataset/", fields=["text", "id"])) -pipeline.add_stage(MultilingualDomainClassifier(filter_by=["Games", "Sports"])) -pipeline.add_stage(JsonlWriter(path="classified_output/")) - -results = pipeline.run() # Uses XennaExecutor by default -``` - -### Quality Classifier - -The Quality Classifier assesses document quality using the NVIDIA Quality Classifier DeBERTa model. - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import QualityClassifier - -pipeline = Pipeline(name="quality_classification") -pipeline.add_stage(JsonlReader(file_paths="web_documents/", fields=["text", "id"])) -pipeline.add_stage(QualityClassifier()) -pipeline.add_stage(JsonlWriter(path="quality_classified/")) - -results = pipeline.run() # Uses XennaExecutor by default -``` - -:::{note} -The exact label categories returned by the Quality Classifier depend on the model configuration. Check the prediction column in your results to see the available labels for filtering with the `filter_by` parameter. -::: - -### AEGIS Safety Classifier - -The AEGIS classifier detects unsafe content across 13 critical risk categories. It requires a HuggingFace token for access to Llama Guard. 
- -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import AegisClassifier - -# Create pipeline -pipeline = Pipeline(name="aegis_classification") - -# Load dataset -reader = JsonlReader( - file_paths="content/", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the AEGIS classifier -token = "hf_1234" # Your HuggingFace user access token -safety_classifier = AegisClassifier( - aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0", - hf_token=token, - filter_by=["safe", "O13"] # Keep only safe content and "needs caution" category -) -pipeline.add_stage(safety_classifier) - -# Save the results -writer = JsonlWriter(path="safe_content/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -The classifier adds a column with labels: "safe," "O1" through "O13" (each representing specific safety risks), or "unknown." For raw LLM output, use: - -```python -safety_classifier = AegisClassifier( - aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0", - hf_token=token, - keep_raw_output=True, - raw_output_field="raw_predictions" -) -``` - -### Instruction Data Guard - -Detects LLM poisoning attacks in instruction-response datasets. Requires HuggingFace token access. - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import InstructionDataGuardClassifier - -# Create pipeline -pipeline = Pipeline(name="instruction_data_guard") - -# Load dataset -# For instruction-response data: "Instruction: {instruction}. Input: {input_}. Response: {response}." 
-reader = JsonlReader( - file_paths="instruction_data/", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the classifier -token = "hf_1234" # Your HuggingFace user access token -classifier = InstructionDataGuardClassifier(hf_token=token) -pipeline.add_stage(classifier) - -# Save the results -writer = JsonlWriter(path="guard_classified/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -The output includes two columns: a float score `instruction_data_guard_poisoning_score` and a Boolean `is_poisoned`. - -### FineWeb Educational Content Classifier - -Scores documents on educational value from 0–5. This helps prioritize content for knowledge-intensive tasks. - -#### Score Ranges and Meanings - -| Score | Label | Description | Example Content | -|-------|-------|-------------|-----------------| -| 0-1 | Very Low | No educational value | Spam, advertisements, broken content | -| 2 | Low | Minimal educational content | Simple lists, basic product descriptions | -| 3 | Moderate | Some educational value | News articles, basic how-to guides | -| 4 | High | Good educational content | Detailed tutorials, academic discussions | -| 5 | Very High | Excellent educational material | Comprehensive guides, scholarly articles | - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import FineWebEduClassifier - -# Create pipeline -pipeline = Pipeline(name="fineweb_edu_classification") - -# Load dataset -reader = JsonlReader( - file_paths="web_documents/*.jsonl", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the FineWeb Edu classifier -edu_classifier = FineWebEduClassifier( - model_inference_batch_size=256, - float_score_field="fineweb-edu-score-float", # Raw float scores - int_score_field="fineweb-edu-score-int", # 
Rounded integer scores - label_field="fineweb-edu-score-label" # Quality labels -) -pipeline.add_stage(edu_classifier) - -# Save the results -writer = JsonlWriter(path="edu_classified/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -### FineWeb Mixtral and Nemotron Edu Classifiers - -Similar to the FineWeb Edu Classifier but trained with different annotation sources: - -- **FineWebMixtralEduClassifier**: Uses annotations from Mixtral 8x22B-Instruct -- **FineWebNemotronEduClassifier**: Uses annotations from Nemotron-4-340B-Instruct - -Both provide a quality label column marking scores above 2.5 as "high_quality": - -#### Quality Label Mapping - -| Score Range | Quality Label | Description | -|-------------|---------------|-------------| -| 0.0 - 2.5 | `low_quality` | Below average educational value | -| 2.5 - 5.0 | `high_quality` | Above average educational value | - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import FineWebMixtralEduClassifier # or FineWebNemotronEduClassifier - -# Create pipeline -pipeline = Pipeline(name="fineweb_mixtral_edu_classification") - -# Load dataset -reader = JsonlReader( - file_paths="web_documents/*.jsonl", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the FineWeb Mixtral Edu classifier -classifier = FineWebMixtralEduClassifier( - float_score_field="fineweb-mixtral-edu-score-float", # Raw float scores - int_score_field="fineweb-mixtral-edu-score-int", # Rounded integer scores - label_field="fineweb-mixtral-edu-score-label" # "high_quality" or "low_quality" -) -pipeline.add_stage(classifier) - -# Save the results -writer = JsonlWriter(path="mixtral_edu_classified/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` 
- -### Content Type Classifier - -Categorizes documents into 11 distinct speech types. - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import ContentTypeClassifier - -# Create pipeline -pipeline = Pipeline(name="content_type_classification") - -# Load dataset -reader = JsonlReader( - file_paths="content/", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the Content Type classifier -classifier = ContentTypeClassifier(filter_by=["Blogs", "News"]) -pipeline.add_stage(classifier) - -# Save the results -writer = JsonlWriter(path="content_type_classified/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -### Prompt Task and Complexity Classifier - -Classifies prompts by task type and complexity dimensions. - -```python -from nemo_curator.pipeline import Pipeline -from nemo_curator.stages.text.io.reader import JsonlReader -from nemo_curator.stages.text.io.writer import JsonlWriter -from nemo_curator.stages.text.classifiers import PromptTaskComplexityClassifier - -# Create pipeline -pipeline = Pipeline(name="prompt_task_complexity_classification") - -# Load dataset -reader = JsonlReader( - file_paths="prompts/", - fields=["text", "id"] -) -pipeline.add_stage(reader) - -# Apply the Prompt Task Complexity classifier -classifier = PromptTaskComplexityClassifier() -pipeline.add_stage(classifier) - -# Save the results -writer = JsonlWriter(path="prompt_complexity_classified/") -pipeline.add_stage(writer) - -# Execute pipeline -results = pipeline.run() # Uses XennaExecutor by default -``` - -## Custom Model Integration - -You can integrate your own classification models by extending `DistributedDataClassifier`. 
Refer to the [Text Classifiers README](https://github.com/NVIDIA-NeMo/Curator/tree/main/nemo_curator/stages/text/classifiers#text-classifiers) for implementation details and examples. - -## Performance Optimization - -NVIDIA NeMo Curator's distributed classifiers are optimized for high-throughput processing through several key features: - -### CPU-based tokenization and GPU-based model inference - -Each classifier is broken down under the hood into a tokenizer stage and a model inference stage. Tokenization is run on the CPU while model inference is run on the GPU. For example, this means that behind the scenes, the `DomainClassifier` stage is actually being broken down into 2 stages (some parameters and details omitted to avoid complexity): - -```python -class TokenizerStage: - self.resources = Resources(cpus=1) - self.model_identifier = "nvidia/domain-classifier" - self.text_field = "text" - self.padding_side = "right" - ... -class ModelStage: - self.resources = Resources(cpus=1, gpus=1) - self.model_identifier = "nvidia/domain-classifier" - self.model_inference_batch_size = 256 - ... -``` - -Pipelines take care of resource allocation and autoscaling to achieve enhanced performance and minimize GPU idleness. This means that we are able to achieve speedups by ensuring that model inference is run in parallel across all available GPUs, while other stages such as I/O, tokenization, and filtering are run across all available CPUs. This is possible because Curator pipelines are composable, which allows each stage in a pipeline to run independently and with its own specified hardware resources. 
- -### Intelligent Batching and Sequence Handling - -The classifiers optimize throughput through: - -- **Length-based sorting**: Input sequences are sorted by length when `sort_by_length=True` (default) -- **Efficient batching**: Similar-length sequences are grouped together to minimize padding overhead -- **GPU memory optimization**: Batches are sized to maximize GPU utilization based on available memory - -### Avoid Unnecessary Re-Tokenization - -Several of the text classifiers use the same tokenizer before running the model forward pass. To avoid unnecessary re-tokenization, the `keep_tokens` and `use_existing_tokens` parameters can be used. - -**Important: Not every text classifier uses the same tokenizer, so it is important to confirm that classifiers' tokenizers are compatible with each other. Curator will not verify this for you.** - -The `ContentTypeClassifier`, `QualityClassifier`, `DomainClassifier`, and `PromptTaskComplexityClassifier` all use a DeBERTa tokenizer, which means that we only need to tokenize once. To avoid unnecessary re-tokenization, you can do: - -```python -# Since this is the first classifier in the pipeline, there are no existing tokens to use, -# but we can make sure to keep the computed tokens for the next classifier -content_type_classifier = ContentTypeClassifier(use_existing_tokens=False, keep_tokens=True, ...) -pipeline.add_stage(content_type_classifier) - -# Use tokens from the previous classifier and keep tokens for the next classifier -quality_classifier = QualityClassifier(use_existing_tokens=True, keep_tokens=True, ...) -pipeline.add_stage(quality_classifier) - -# Use tokens from the previous classifier and keep tokens for the next classifier -domain_classifier = DomainClassifier(use_existing_tokens=True, keep_tokens=True, ...) 
-pipeline.add_stage(domain_classifier) - -# Use tokens from the previous classifier -# Since this is the final classifier in the pipeline, we drop the computed tokens -prompt_task_complexity_classifier = PromptTaskComplexityClassifier(use_existing_tokens=True, keep_tokens=False, ...) -pipeline.add_stage(prompt_task_complexity_classifier) -``` - -In addition to the above example, the `FineWebEduClassifier`, `FineWebMixtralEduClassifier`, and `FineWebNemotronEduClassifier` are all compatible with each other: - -```python -fineweb_classifier = FineWebEduClassifier(use_existing_tokens=False, keep_tokens=True, ...) -pipeline.add_stage(fineweb_classifier) - -fineweb_mixtral_classifier = FineWebMixtralEduClassifier(use_existing_tokens=True, keep_tokens=True, ...) -pipeline.add_stage(fineweb_mixtral_classifier) - -fineweb_nemotron_classifier = FineWebNemotronEduClassifier(use_existing_tokens=True, keep_tokens=False, ...) -pipeline.add_stage(fineweb_nemotron_classifier) -``` - -The `AegisClassifier` variants ([nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0)) are compatible with each other as well. This example is a bit more complex because it also involves keeping the formatted Aegis prompt field. See the `AegisClassifier` implementation for more details. - -```python -aegis_defensive_classifier = AegisClassifier( - aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0", - label_field="aegis_defensive_pred", - use_existing_tokens=False, - keep_tokens=True, - keep_aegis_prompt_field=True, - ... 
-) -pipeline.add_stage(aegis_defensive_classifier) - -aegis_permissive_classifier = AegisClassifier( - aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0", - label_field="aegis_permissive_pred", - use_existing_tokens=True, - aegis_prompt_field="_curator_hidden_text", # created by aegis_defensive_classifier - keep_tokens=False, - keep_aegis_prompt_field=False, - ... -) -pipeline.add_stage(aegis_permissive_classifier) -``` - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/filters/doc_filter.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from abc import ABC, abstractmethod - - -class DocumentFilter(ABC): - """ - An abstract base class for text-based document filters. - - This class serves as a template for creating specific document filters - in the library. Subclasses should implement the abstract methods to - define custom filtering behavior. - """ - - def __init__(self): - super().__init__() - self._name = self.__class__.__name__ - self._sentences = None - self._paragraphs = None - self._ngrams = None - - @abstractmethod - def score_document(self, text: str) -> float | list[int | float]: - """ - Calculate a score for the given document text. - - This method should be implemented by subclasses to define how - a document's text is evaluated and scored. - - Args: - text (str): The text content of the document to be scored. 
- - Returns: - Any: A score or set of scores representing the document's - relevance or quality. The type and structure of the - return value should be consistent for each subclass. - - Raises: - NotImplementedError: If the method is not implemented in a subclass. - """ - msg = "score_document method must be implemented by subclasses" - raise NotImplementedError(msg) - - @abstractmethod - def keep_document(self, scores: float | list[int | float]) -> bool: - """ - Determine whether to keep a document based on its scores. - - This method should be implemented by subclasses to define the - criteria for keeping or discarding a document based on the - scores calculated by score_document(). - - Args: - scores (float | list[int | float]): The score or set of scores returned by score_document(). - The type should match what is returned by score_document(). - - Returns: - bool: True if the document should be kept, False otherwise. - - Raises: - NotImplementedError: If the method is not implemented in a subclass. - """ - msg = "keep_document method must be implemented by subclasses" - raise NotImplementedError(msg) - - @property - def name(self) -> str: - return self._name - - @property - def sentences(self) -> list: - return self._sentences - - @sentences.setter - def sentences(self, sentences: list) -> None: - self._sentences = sentences - - @property - def paragraphs(self) -> list: - return self._paragraphs - - @paragraphs.setter - def paragraphs(self, paragraphs: list) -> None: - self._paragraphs = paragraphs - - @property - def ngrams(self) -> dict: - return self._ngrams - - @ngrams.setter - def ngrams(self, ngrams: dict) -> None: - self._ngrams = ngrams - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/filters/score_filter.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Callable -from dataclasses import dataclass -from typing import Any, Literal - -import pandas as pd -from loguru import logger - -from nemo_curator.backends.base import NodeInfo, WorkerMetadata -from nemo_curator.stages.base import ProcessingStage -from nemo_curator.stages.text.filters.doc_filter import DocumentFilter -from nemo_curator.tasks import DocumentBatch - - -@dataclass -class Score(ProcessingStage[DocumentBatch, DocumentBatch]): - """ - The module responsible for adding metadata to records based on statistics about the text. - It accepts an arbitrary scoring function that accepts a text field and returns a score. - It also accepts a DocumentFilter object, in which case the score_fn will be the score_document method of the DocumentFilter. - - Unlike ScoreFilter, it does not filter based on the computed score. - It only adds metadata to the record. - - If a list of DocumentFilters is provided, the filters are applied in order. - In this case, the score_field parameter should be a list of strings corresponding to the filters. - If different filters should be applied to different text fields, then text_field should be a list of strings corresponding to the filters. - - Args: - score_fn (Callable | DocumentFilter | list[DocumentFilter]): The score function or the DocumentFilter object (or list of DocumentFilters). If it is a DocumentFilter object, the score_fn will be the score_document method of the DocumentFilter. - score_field (str | list[str]): The field (or list of fields) the score will be stored in. 
- text_field (str | list[str]): The field (or list of fields) the documents will be read from. - - """ - - score_fn: Callable[[str], float | str] | DocumentFilter | list[DocumentFilter] - score_field: str | list[str] - text_field: str | list[str] = "text" - name: str = "score_fn" - - def __post_init__(self): - self.name, self.score_fn, self.text_field, _, self.score_field = _validate_and_normalize_filters( - self.score_fn, self.text_field, None, self.score_field, "score" - ) - - def inputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.text_field - - def outputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.text_field + self.score_field - - def ray_stage_spec(self) -> dict[str, Any]: - requires_setup = any( - hasattr(score_fn, "load_model") or hasattr(score_fn, "load_tokenizer") - for score_fn in self.score_fn - if isinstance(score_fn, DocumentFilter) - ) - return {"is_actor_stage": requires_setup} - - def setup_on_node( - self, - _node_info: NodeInfo | None = None, - _worker_metadata: WorkerMetadata | None = None, - ) -> None: - for score_fn in self.score_fn: - if isinstance(score_fn, DocumentFilter) and hasattr(score_fn, "model_check_or_download"): - score_fn.model_check_or_download() - - def setup(self, _: WorkerMetadata | None = None) -> None: - for score_fn in self.score_fn: - if isinstance(score_fn, DocumentFilter): - if hasattr(score_fn, "load_model"): - score_fn.load_model() - if hasattr(score_fn, "load_tokenizer"): - score_fn.load_tokenizer() - - def process(self, batch: DocumentBatch) -> DocumentBatch | None: - """ - Applies the scoring to a dataset - - Args: - batch (DocumentBatch): The batch to apply the module to - - Returns: - DocumentBatch: A batch with the new score - - """ - df = batch.to_pandas() - - if df.empty: - logger.info(f"Empty dataset for batch {batch.task_id}") - return batch - - for score_fn_i, text_field_i, score_field_i in zip( - self.score_fn, self.text_field, self.score_field, strict=True - ): - 
inner_score_fn = score_fn_i.score_document if isinstance(score_fn_i, DocumentFilter) else score_fn_i - df[score_field_i] = df[text_field_i].apply(inner_score_fn) - - # Create output batch - return DocumentBatch( - task_id=f"{batch.task_id}_{self.name}", - dataset_name=batch.dataset_name, - data=df, - _metadata=batch._metadata, - _stage_perf=batch._stage_perf, - ) - - -@dataclass -class Filter(ProcessingStage[DocumentBatch, DocumentBatch]): - """ - The module responsible for filtering records based on a metadata field. - It accepts an arbitrary filter function that accepts a metadata field and returns True if the field should be kept. - It also accepts a DocumentFilter object, in which case the filter_fn will be the keep_document method of the DocumentFilter. - Unlike ScoreFilter, it does not compute the metadata based on a document. - It only filters using existing metadata. - - If a list of DocumentFilters is provided, the filters are applied in order. - In this case, the filter_field parameter should be a list of strings corresponding to the filters. - If some filters should be inverted and others not, then invert should be a list of booleans corresponding to the filters. - - Args: - filter_fn (Callable | DocumentFilter | list[DocumentFilter]): A function (or list of functions) that returns True if the document is to be kept or a DocumentFilter object, - in which case the filter_fn will be the keep_document method of the DocumentFilter. - filter_field (str | list[str]): The field (or list of fields) to be passed into the filter function. - invert (bool | list[bool]): Whether to invert the filter condition. 
- - """ - - filter_fn: Callable | DocumentFilter | list[DocumentFilter] - filter_field: str | list[str] - invert: bool | list[bool] = False - name: str = "filter_fn" - - def __post_init__(self): - self.name, self.filter_fn, self.filter_field, self.invert, _ = _validate_and_normalize_filters( - self.filter_fn, self.filter_field, self.invert, None, "filter" - ) - - def inputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.filter_field - - def outputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.filter_field - - def compute_filter_mask( - self, df: pd.DataFrame, filter_fn: Callable | DocumentFilter, filter_field: str, invert: bool - ) -> pd.Series: - """Compute the bool mask to filter the dataset. - - Args: - df (pd.DataFrame): The dataset to compute filter mask on. - filter_fn (Callable | DocumentFilter): The filter function to use. - filter_field (str): The field to read the filter from. - invert (bool): Whether to invert the filter condition. - - Returns: - Series: A mask corresponding to each data instance indicating whether it will be retained. 
- - """ - - if isinstance(filter_fn, DocumentFilter): - filter_fn = filter_fn.keep_document - - bool_mask = df[filter_field].apply(filter_fn) - - if invert: - bool_mask = ~bool_mask - - return bool_mask - - def process(self, batch: DocumentBatch) -> DocumentBatch | None: - """ - Applies the filtering to a dataset - - Args: - batch (DocumentBatch): The batch to apply the module to - - Returns: - DocumentBatch: A batch with entries removed according to the filter - - """ - df = batch.to_pandas() - - if df.empty: - logger.info(f"Empty dataset for batch {batch.task_id}") - return batch - - for filter_fn_i, filter_field_i, invert_i in zip(self.filter_fn, self.filter_field, self.invert, strict=True): - bool_mask = self.compute_filter_mask(df, filter_fn_i, filter_field_i, invert_i) - df = df[bool_mask] - - if len(df) == 0: - logger.info(f"All documents filtered out for batch {batch.task_id}") - - # Create output batch - return DocumentBatch( - task_id=f"{batch.task_id}_{self.name}", - dataset_name=batch.dataset_name, - data=df, - _metadata=batch._metadata, - _stage_perf=batch._stage_perf, - ) - - -@dataclass -class ScoreFilter(ProcessingStage[DocumentBatch, DocumentBatch]): - """ - The module responsible for applying a filter (or chain of filters) to all documents in a dataset. - It accepts an arbitrary DocumentFilter and first computes the score for a document. - Then, determines whether to keep the document based on the criteria in the DocumentFilter. - - The filter can be applied to any field in the dataset, and the score can be logged for later. - Also, the filter can be inverted such that "rejected" documents are kept. - - If a list of DocumentFilters is provided, the filters are applied in order. - If different filters should be applied to different text fields, then text_field should be a list of strings corresponding to the filters. 
- If different score fields should be created for each filter, then score_field should be a list of strings corresponding to the filters. - If some filters should be inverted and others not, then invert should be a list of booleans corresponding to the filters. - - Args: - filter_obj (DocumentFilter | list[DocumentFilter]): The score function (or list of score functions) that takes in a document string and outputs a score for the document. - text_field (str | list[str]): The field (or list of fields) the documents will be read from. - score_field (str | list[str] | None): The field (or list of fields) to which the scores will be written. If None, scores will be immediately discarded after use. - invert (bool | list[bool]): If True, will keep all documents that are normally discarded. - - """ - - filter_obj: DocumentFilter | list[DocumentFilter] - text_field: str | list[str] = "text" - score_field: str | list[str] | None = None - invert: bool | list[bool] = False - name: str = "score_filter" - - def __post_init__(self): - self.name, self.filter_obj, self.text_field, self.invert, self.score_field = _validate_and_normalize_filters( - self.filter_obj, self.text_field, self.invert, self.score_field, "score_filter" - ) - - def inputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.text_field - - def outputs(self) -> tuple[list[str], list[str]]: - return ["data"], self.text_field + self.score_field if self.score_field is not None else [] - - def ray_stage_spec(self) -> dict[str, Any]: - requires_setup = any( - hasattr(filter_obj, "load_model") or hasattr(filter_obj, "load_tokenizer") - for filter_obj in self.filter_obj - if isinstance(filter_obj, DocumentFilter) - ) - return {"is_actor_stage": requires_setup} - - def setup_on_node( - self, - _node_info: NodeInfo | None = None, - _worker_metadata: WorkerMetadata | None = None, - ) -> None: - for filter_obj in self.filter_obj: - if isinstance(filter_obj, DocumentFilter) and hasattr(filter_obj, 
"model_check_or_download"): - filter_obj.model_check_or_download() - - def setup(self, _: WorkerMetadata | None = None) -> None: - for filter_obj in self.filter_obj: - if isinstance(filter_obj, DocumentFilter): - if hasattr(filter_obj, "load_model"): - filter_obj.load_model() - if hasattr(filter_obj, "load_tokenizer"): - filter_obj.load_tokenizer() - - def compute_filter_mask( - self, df: pd.DataFrame, filter_obj: DocumentFilter, text_field: str, score_field: str | None, invert: bool - ) -> pd.Series: - """Compute the bool mask to filter the dataset. - - Args: - df (pd.DataFrame): The dataset to compute filter mask on. - filter_obj (DocumentFilter): The filter object to use. - text_field (str): The field to read the text from. - score_field (str | None): The field to write the scores to. - invert (bool): Whether to invert the filter condition. - - Returns: - Series: A mask corresponding to each data instance indicating whether it will be retained. - - """ - - scores = df[text_field].apply(filter_obj.score_document) - - if score_field is not None: - df[score_field] = scores - - bool_mask = scores.apply(filter_obj.keep_document) - - if invert: - bool_mask = ~bool_mask - - return bool_mask - - def process(self, batch: DocumentBatch) -> DocumentBatch | None: - """ - Scores and filters all records in the dataset - - Args: - batch (DocumentBatch): The batch to apply the module to - - Returns: - DocumentBatch: A batch with the score and filter applied - - """ - df = batch.to_pandas() - - if df.empty: - logger.info(f"Empty dataset for batch {batch.task_id}") - return batch - - for filter_obj_i, text_field_i, score_field_i, invert_i in zip( - self.filter_obj, self.text_field, self.score_field, self.invert, strict=True - ): - bool_mask = self.compute_filter_mask(df, filter_obj_i, text_field_i, score_field_i, invert_i) - df = df[bool_mask] - - if len(df) == 0: - logger.info(f"All documents filtered out for batch {batch.task_id}") - - # Create output batch - return 
DocumentBatch( - task_id=f"{batch.task_id}_{self.name}", - dataset_name=batch.dataset_name, - data=df, - _metadata=batch._metadata, - _stage_perf=batch._stage_perf, - ) - - -def _filter_name(x: DocumentFilter | Callable) -> str: - return x.name if isinstance(x, DocumentFilter) else x.__name__ - - -def _get_filter_stage_name(filters: list[DocumentFilter | Callable], prefix: str) -> str: - """ - Derive the stage name from the provided score/filter functions. - - """ - return ( - _filter_name(filters[0]) - if len(filters) == 1 - else f"{prefix}_chain_of_" + "_".join(_filter_name(f) for f in filters) - ) - - -def _format_single_field_list( - _field: str | list[str] | None, field_name: str, field_type: type = str -) -> list[str] | list[bool]: - """ - In the case of a single DocumentFilter or Callable, format the relevant field - (filter_field, score_field, text_field, invert) to a list of length 1. - - Args: - _field (str | list[str] | None): The field to check and format. - field_name (str): The name of the field, which is used in error messages. - field_type (type): The type of the field, which is used in an isinstance check. - - Returns: - list[str] | list[bool]: The reformatted field. - - """ - if isinstance(_field, list): - if len(_field) > 1: - msg = f"More {field_name} fields than functions provided: {_field}" - raise ValueError(msg) - elif isinstance(_field, field_type): - _field = [_field] - else: - msg = f"{field_name} field must be a {field_type} or list of {field_type}: {_field}" - raise TypeError(msg) - - return _field - - -def _format_field_list( - _field: str | list[str] | None, filter_count: int, field_name: str, field_type: type = str -) -> list[str] | list[bool]: - """ - In the case of a list of DocumentFilters or Callables, format the relevant field - (filter_field, score_field, text_field, invert) to a list of length equal to the number of filters. - - Args: - _field (str | list[str] | None): The field to check and format. 
- filter_count (int): The number of filters. This will be the length of the output list. - field_name (str): The name of the field, which is used in error messages. - field_type (type): The type of the field, which is used in an isinstance check. - - Returns: - list[str] | list[bool]: The reformatted field. - - """ - if isinstance(_field, list): - if len(_field) == 1: - logger.info(f"Using the same {field_name} field for all functions: {_field}") - _field = [_field] * filter_count - if len(_field) != filter_count: - msg = f"Number of {field_name} fields must match number of functions: {_field}" - raise ValueError(msg) - elif isinstance(_field, field_type): - logger.info(f"Using the same {field_name} field for all functions: {_field}") - _field = [_field] * filter_count - else: - msg = f"{field_name} field must be a {field_type} or list of {field_type}: {_field}" - raise TypeError(msg) - - return _field - - -def _validate_and_normalize_filters( # noqa: C901, PLR0912 - _filter: DocumentFilter | Callable | list[DocumentFilter | Callable], - input_field: str | list[str] | None, - invert: bool | list[bool] | None, - output_field: str | list[str] | None, - fn_type: Literal["score", "filter", "score_filter"], -) -> tuple[str, list[DocumentFilter | Callable], list[str] | None, list[bool] | None, list[str] | None]: - """ - Validate and normalize all parameters needed for the Score, Filter, and ScoreFilter modules. - "Normalize" means to reformat all parameters to a list of length equal to the number of filters. - - Args: - _filter (DocumentFilter | Callable | list[DocumentFilter | Callable]): The filter object or list of filter objects. - input_field (str | list[str] | None): The input field. For Score and ScoreFilter, this is the text field. For Filter, this is the filter field. - invert (bool | list[bool] | None): The invert flag. This is used for Filter and ScoreFilter. - output_field (str | list[str] | None): The output field. 
For Score and ScoreFilter, this is the score field. For Filter, this is not used. - fn_type (Literal["score", "filter", "score_filter"]): The type of the module. - - Returns: - tuple[str, list[DocumentFilter | Callable], list[str] | None, list[bool] | None, list[str] | None]: - The first string returned corresponds to the name given to the DocumentFilter or Callable. - The normalized filters, input fields, invert flags, and output fields make up the rest of the tuple. - - """ - - # For Score and ScoreFilter, the input_field is the text field - # For Filter, the input_field is the filter field - input_field_name = "filter" if fn_type == "filter" else "text" - if input_field is None: - msg = f"{input_field_name}_field cannot be None" - raise ValueError(msg) - - # Score is the only module that explicitly requires an output field, - # i.e., a score_field that is calculated by the DocumentFilter or Callable. - if output_field is None and fn_type == "score": - msg = "score_field cannot be None" - raise ValueError(msg) - - if isinstance(_filter, DocumentFilter): - _name = _filter.name - elif isinstance(_filter, Callable): - _name = f"{fn_type}_fn" - - if isinstance(_filter, (DocumentFilter, Callable)): - _normalized_filter = [_filter] - _input_field = _format_single_field_list(input_field, input_field_name, field_type=str) - - if fn_type in ["filter", "score_filter"]: - _invert = _format_single_field_list(invert, "invert", field_type=bool) - else: - # Score does not use an invert flag - _invert = None - - if fn_type in ["score", "score_filter"]: - # ScoreFilter is allowed to have no output fields, but Score is not - if output_field is None and fn_type == "score_filter": - _output_field = [None] - else: - _output_field = _format_single_field_list(output_field, "score", field_type=str) - else: - # Filter does not use an output field - _output_field = None - - elif isinstance(_filter, list): - _name = _get_filter_stage_name(_filter, prefix=fn_type) - _normalized_filter = 
_filter - - # Technically, you could run a list of filters on the same filter_field. - # However, prefer to use a list of fields to avoid confusion. - if fn_type == "filter" and ( - isinstance(input_field, str) or (isinstance(input_field, list) and len(input_field) == 1) - ): - msg = f"filter_field must be a list of strings if multiple filters are used: {input_field}" - raise ValueError(msg) - - _input_field = _format_field_list(input_field, len(_filter), input_field_name, field_type=str) - - if fn_type in ["filter", "score_filter"]: - _invert = _format_field_list(invert, len(_filter), "invert", field_type=bool) - else: - # Score does not use an invert flag - _invert = None - - if fn_type in ["score", "score_filter"]: - # ScoreFilter is allowed to have no output fields, but Score is not - if output_field is None and fn_type == "score_filter": - _output_field = [None] * len(_filter) - # Output fields are always required to be a (unique) list of strings. - # We check that here. - elif isinstance(output_field, str) or (isinstance(output_field, list) and len(output_field) == 1): - msg = f"score_field must be a list of strings if multiple filters are used: {output_field}" - raise ValueError(msg) - else: - _output_field = _format_field_list(output_field, len(_filter), "score", field_type=str) - else: - # Filter does not use an output field - _output_field = None - - return _name, _normalized_filter, _input_field, _invert, _output_field - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/base.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from dataclasses import dataclass -from typing import Literal - -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - -import numpy as np -import pandas as pd -import torch -from huggingface_hub import PyTorchModelHubMixin -from torch import nn -from transformers import AutoConfig, AutoModel - -from nemo_curator.stages.base import CompositeStage, ProcessingStage -from nemo_curator.stages.text.filters import Filter -from nemo_curator.stages.text.models.model import ModelStage -from nemo_curator.stages.text.models.tokenizer import TokenizerStage -from nemo_curator.stages.text.models.utils import ATTENTION_MASK_FIELD, INPUT_ID_FIELD -from nemo_curator.tasks import DocumentBatch - -from .utils import SortByLengthStage - - -class Deberta(nn.Module, PyTorchModelHubMixin): - """ - Base PyTorch model where we add a classification head. - - Args: - config: The configuration of the model. 
- - """ - - def __init__(self, config: dataclass): - super().__init__() - self.model = AutoModel.from_pretrained(config["base_model"]) - self.dropout = nn.Dropout(config["fc_dropout"]) - self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"])) - - @property - def device(self) -> torch.device: - return next(self.parameters()).device - - @torch.no_grad() - def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor: - features = self.model(batch[INPUT_ID_FIELD], batch[ATTENTION_MASK_FIELD]).last_hidden_state - dropped = self.dropout(features) - outputs = self.fc(dropped) - - del batch, features, dropped - - return torch.softmax(outputs[:, 0, :], dim=1) - - -class ClassifierModelStage(ModelStage): - """ - Stage for Hugging Face model inference. - - Args: - model_identifier: The identifier of the Hugging Face model. - label_field: The name of the prediction column. - score_field: The name of the probability column. Defaults to None. - model_inference_batch_size: The size of the batch for model inference. Defaults to 256. - has_seq_order: Whether to sort the input data by the length of the input tokens. - Sorting is encouraged to improve the performance of the inference model. Defaults to True. - padding_side: The side to pad the input tokens. Defaults to "right". - max_seq_length: If provided, clips the input tokens before the forward pass. Defaults to None. - autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference. - Defaults to True. - keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False. 
- - """ - - def __init__( # noqa: PLR0913 - self, - model_identifier: str, - cache_dir: str | None = None, - label_field: str = "preds", - score_field: str | None = None, - model_inference_batch_size: int = 256, - has_seq_order: bool = True, - padding_side: Literal["left", "right"] = "right", - max_seq_length: int | None = None, - autocast: bool = True, - keep_tokens: bool = False, - ): - super().__init__( - model_identifier=model_identifier, - cache_dir=cache_dir, - has_seq_order=has_seq_order, - model_inference_batch_size=model_inference_batch_size, - padding_side=padding_side, - max_seq_length=max_seq_length, - unpack_inference_batch=False, - autocast=autocast, - ) - - self.label_field = label_field - if score_field is not None: - self.score_field = score_field - self.keep_score_field = True - else: - self.score_field = "probs" - self.keep_score_field = False - - self.keep_tokens = keep_tokens - - def outputs(self) -> tuple[list[str], list[str]]: - return ["data"], [self.label_field] + ([self.score_field] if self.keep_score_field else []) - - def _setup(self, local_files_only: bool = True) -> None: - self.model = ( - Deberta.from_pretrained(self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only) - .cuda() - .eval() - ) - - config = AutoConfig.from_pretrained( - self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only - ) - self.labels = list(config.label2id.keys()) - self.labels.sort(key=lambda x: config.label2id[x]) - - def process_model_output( - self, outputs: torch.Tensor, _: dict[str, torch.Tensor] | None = None - ) -> dict[str, np.ndarray]: - probs = outputs.cpu().numpy() - preds = np.argmax(probs, axis=1) - - pred_labels = [self.labels[idx] for idx in preds] - - return { - self.score_field: probs, - self.label_field: np.array(pred_labels), - } - - def create_output_dataframe(self, df_cpu: pd.DataFrame, collected_output: dict[str, np.ndarray]) -> pd.DataFrame: - if not self.keep_tokens: - df_cpu = 
df_cpu.drop(columns=[INPUT_ID_FIELD, ATTENTION_MASK_FIELD]) - - df_cpu[self.label_field] = collected_output[self.label_field] - - if self.keep_score_field: - df_cpu[self.score_field] = collected_output[self.score_field].tolist() - - return df_cpu - - -@dataclass(kw_only=True) -class DistributedDataClassifier(CompositeStage[DocumentBatch, DocumentBatch]): - """ - Base composite stage for distributed data classification. - - It decomposes into a tokenizer stage and a model stage. - - Args: - model_identifier: The identifier of the Hugging Face model. - cache_dir: The Hugging Face cache directory. Defaults to None. - label_field: The name of the prediction column. Defaults to "preds". - score_field: The name of the probability column. Defaults to None. - text_field: The name of the text field in the input data. Defaults to "text". - filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None. - max_chars: Limits the total number of characters that can be fed to the tokenizer. - If None, text will not be truncated. Defaults to None. - max_seq_length: Limits the total sequence returned by the tokenizer so that it has a maximum length. - If None, the tokenizer's model_max_length is used. Defaults to 512. - padding_side: The side to pad the input tokens. Defaults to "right". - sort_by_length: Whether to sort the input data by the length of the input tokens. - Sorting is encouraged to improve the performance of the inference model. Defaults to True. - model_inference_batch_size: The size of the batch for model inference. Defaults to 256. - autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference. - Defaults to True. - keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False. - use_existing_tokens: Whether to use the existing tokens from the input dataframe. - If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization. 
- Defaults to False. - - """ - - model_identifier: str - cache_dir: str | None = None - label_field: str = "preds" - score_field: str | None = None - text_field: str = "text" - filter_by: list[str] | None = None - max_chars: int | None = None - max_seq_length: int | None = None - padding_side: Literal["left", "right"] = "right" - sort_by_length: bool = True - model_inference_batch_size: int = 256 - autocast: bool = True - keep_tokens: bool = False - use_existing_tokens: bool = False - - def __post_init__(self) -> None: - super().__init__() - - self.stages = [] - - if not self.use_existing_tokens: - tokenizer_stage = TokenizerStage( - model_identifier=self.model_identifier, - cache_dir=self.cache_dir, - text_field=self.text_field, - max_chars=self.max_chars, - max_seq_length=self.max_seq_length, - padding_side=self.padding_side, - sort_by_length=self.sort_by_length, - ) - self.stages.append(tokenizer_stage) - # The TokenizerStage already truncates to the max_seq_length, so the ModelStage does not need to do it again - model_max_seq_length = None - else: - # The ModelStage will truncate to the max_seq_length before the forward pass - model_max_seq_length = self.max_seq_length - - # Ensure that the data is sorted by length if the tokens are already present and sort_by_length is True - if self.use_existing_tokens and self.sort_by_length: - sort_by_length_stage = SortByLengthStage() - self.stages.append(sort_by_length_stage) - - model_stage = ClassifierModelStage( - model_identifier=self.model_identifier, - cache_dir=self.cache_dir, - label_field=self.label_field, - score_field=self.score_field, - model_inference_batch_size=self.model_inference_batch_size, - has_seq_order=self.sort_by_length, - padding_side=self.padding_side, - max_seq_length=model_max_seq_length, - autocast=self.autocast, - keep_tokens=self.keep_tokens, - ) - self.stages.append(model_stage) - - if self.filter_by is not None and len(self.filter_by) > 0: - 
self.stages.append(Filter(filter_fn=self.filter_by_category, filter_field=self.label_field)) - - def inputs(self) -> tuple[list[str], list[str]]: - return self.stages[0].inputs() - - def outputs(self) -> tuple[list[str], list[str]]: - return self.stages[-1].outputs() - - def filter_by_category(self, value: str) -> bool: - return value in self.filter_by - - def decompose(self) -> list[ProcessingStage]: - return self.stages - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/domain.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - -from nemo_curator.stages.text.models.utils import format_name_with_suffix - -from .base import DistributedDataClassifier -from .utils import DEBERTA_TOKENIZER_PADDING_SIDE - -DOMAIN_MODEL_IDENTIFIER = "nvidia/domain-classifier" -MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER = "nvidia/multilingual-domain-classifier" -MAX_SEQ_LENGTH = 512 - - -class DomainClassifier(DistributedDataClassifier): - """ - DomainClassifier is a specialized classifier designed for English text domain classification tasks, - utilizing the NemoCurator Domain Classifier (https://huggingface.co/nvidia/domain-classifier) model. - This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets. - - Attributes: - cache_dir: The Hugging Face cache directory. 
Defaults to None. - label_field: The name of the prediction column. Defaults to "domain_pred". - score_field: The name of the probability column. Defaults to None. - text_field: The name of the text field in the input data. Defaults to "text". - filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None. - max_chars: The maximum number of characters to use from the input text. Defaults to 2000. - sort_by_length: Whether to sort the input data by the length of the input tokens. - Sorting is encouraged to improve the performance of the inference model. Defaults to True. - model_inference_batch_size: The size of the batch for model inference. Defaults to 256. - autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference. - Defaults to True. - keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False. - use_existing_tokens: Whether to use the existing tokens from the input dataframe. - If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization. - Defaults to False. 
- - """ - - def __init__( # noqa: PLR0913 - self, - cache_dir: str | None = None, - label_field: str = "domain_pred", - score_field: str | None = None, - text_field: str = "text", - filter_by: list[str] | None = None, - max_chars: int = 2000, - sort_by_length: bool = True, - model_inference_batch_size: int = 256, - autocast: bool = True, - keep_tokens: bool = False, - use_existing_tokens: bool = False, - ): - super().__init__( - model_identifier=DOMAIN_MODEL_IDENTIFIER, - cache_dir=cache_dir, - label_field=label_field, - score_field=score_field, - text_field=text_field, - filter_by=filter_by, - max_chars=max_chars, - max_seq_length=MAX_SEQ_LENGTH, - padding_side=DEBERTA_TOKENIZER_PADDING_SIDE, - sort_by_length=sort_by_length, - model_inference_batch_size=model_inference_batch_size, - autocast=autocast, - keep_tokens=keep_tokens, - use_existing_tokens=use_existing_tokens, - ) - - self.name = format_name_with_suffix(DOMAIN_MODEL_IDENTIFIER) - - -class MultilingualDomainClassifier(DistributedDataClassifier): - """ - MultilingualDomainClassifier is a specialized classifier designed for domain classification tasks, - utilizing the NemoCurator Multilingual Domain Classifier (https://huggingface.co/nvidia/multilingual-domain-classifier) model. - It supports domain classification across 52 languages. - This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets. - - Attributes: - cache_dir: The Hugging Face cache directory. Defaults to None. - label_field: The name of the prediction column. Defaults to "multilingual_domain_pred". - score_field: The name of the probability column. Defaults to None. - text_field: The name of the text field in the input data. Defaults to "text". - filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None. - max_chars: The maximum number of characters to use from the input text. Defaults to 2000. 
- sort_by_length: Whether to sort the input data by the length of the input tokens. - Sorting is encouraged to improve the performance of the inference model. Defaults to True. - model_inference_batch_size: The size of the batch for model inference. Defaults to 256. - autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference. - Defaults to True. - keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False. - use_existing_tokens: Whether to use the existing tokens from the input dataframe. - If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization. - Defaults to False. - - """ - - def __init__( # noqa: PLR0913 - self, - cache_dir: str | None = None, - label_field: str = "multilingual_domain_pred", - score_field: str | None = None, - text_field: str = "text", - filter_by: list[str] | None = None, - max_chars: int = 2000, - sort_by_length: bool = True, - model_inference_batch_size: int = 256, - autocast: bool = True, - keep_tokens: bool = False, - use_existing_tokens: bool = False, - ): - super().__init__( - model_identifier=MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER, - cache_dir=cache_dir, - label_field=label_field, - score_field=score_field, - text_field=text_field, - filter_by=filter_by, - max_chars=max_chars, - max_seq_length=MAX_SEQ_LENGTH, - padding_side=DEBERTA_TOKENIZER_PADDING_SIDE, - sort_by_length=sort_by_length, - model_inference_batch_size=model_inference_batch_size, - autocast=autocast, - keep_tokens=keep_tokens, - use_existing_tokens=use_existing_tokens, - ) - - self.name = format_name_with_suffix(MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER) - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/quality.py -```py -# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - -from nemo_curator.stages.text.models.utils import format_name_with_suffix - -from .base import DistributedDataClassifier -from .utils import DEBERTA_TOKENIZER_PADDING_SIDE - -QUALITY_CLASSIFIER_MODEL_IDENTIFIER = "nvidia/quality-classifier-deberta" -MAX_SEQ_LENGTH = 1024 - - -class QualityClassifier(DistributedDataClassifier): - """ - QualityClassifier is a specialized classifier designed for quality assessment tasks, - utilizing the NemoCurator Quality Classifier DeBERTa model (https://huggingface.co/nvidia/quality-classifier-deberta). - This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets. - - Attributes: - cache_dir: The Hugging Face cache directory. Defaults to None. - label_field: The name of the prediction column. Defaults to "quality_pred". - score_field: The name of the probability column. Defaults to None. - text_field: The name of the text field in the input data. Defaults to "text". - filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None. - max_chars: Limits the total number of characters that can be fed to the tokenizer. - If None, text will not be truncated. Defaults to 6000. - sort_by_length: Whether to sort the input data by the length of the input tokens. - Sorting is encouraged to improve the performance of the inference model. Defaults to True. - model_inference_batch_size: The size of the batch for model inference. Defaults to 256. 
- autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference. - Defaults to True. - keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False. - use_existing_tokens: Whether to use the existing tokens from the input dataframe. - If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization. - Defaults to False. - - """ - - def __init__( # noqa: PLR0913 - self, - cache_dir: str | None = None, - label_field: str = "quality_pred", - score_field: str | None = None, - text_field: str = "text", - filter_by: list[str] | None = None, - max_chars: int = 6000, - sort_by_length: bool = True, - model_inference_batch_size: int = 256, - autocast: bool = True, - keep_tokens: bool = False, - use_existing_tokens: bool = False, - ): - super().__init__( - model_identifier=QUALITY_CLASSIFIER_MODEL_IDENTIFIER, - cache_dir=cache_dir, - label_field=label_field, - score_field=score_field, - text_field=text_field, - filter_by=filter_by, - max_chars=max_chars, - max_seq_length=MAX_SEQ_LENGTH, - padding_side=DEBERTA_TOKENIZER_PADDING_SIDE, - sort_by_length=sort_by_length, - model_inference_batch_size=model_inference_batch_size, - autocast=autocast, - keep_tokens=keep_tokens, - use_existing_tokens=use_existing_tokens, - ) - - self.name = format_name_with_suffix(QUALITY_CLASSIFIER_MODEL_IDENTIFIER) - -``` - -File: /Users/mromeijn/src/Curator/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml -```yaml -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -defaults: - - _self_ - - override hydra/job_logging: none - - override hydra/hydra_logging: none - -hydra: - run: - dir: . - output_subdir: null - -documentation: | - NeMo Curator Pipeline English Heuristic Filter Configuration File - ################################################################# - This configuration file can be used to build a NeMo Curator pipeline that filters English text. - This example reads the input files, runs the heuristic filters, and saves the results. - - The filters below define a chain of heuristic filters to be applied to each document in a corpus. - This particular cascade of filters is intended to filter English language data. - The filter listed at the top will be applied first, and the following filters will be applied in - the order they appear in this file. Each filter can be removed and re-ordered as desired. - - To customize your own pipeline, you can add or remove stages from the stages list, - where _target_ is the stage class and includes all the parameters for the stage. - -input_path: ??? -output_path: ??? 
-text_field: text - -stages: - - _target_: nemo_curator.stages.text.io.reader.JsonlReader - file_paths: ${input_path} - files_per_partition: null - blocksize: null - fields: null - - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.NonAlphaNumericFilter - max_non_alpha_numeric_to_text_ratio: 0.25 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.SymbolsToWordsFilter - max_symbol_to_word_ratio: 0.1 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.NumbersFilter - max_number_to_text_ratio: 0.15 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.UrlsFilter - max_url_to_text_ratio: 0.2 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.WhiteSpaceFilter - max_white_space_ratio: 0.25 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.ParenthesesFilter - max_parentheses_ratio: 0.1 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.BoilerPlateStringFilter - remove_if_at_top_or_bottom: True - max_boilerplate_string_ratio: 0.4 - text_field: ${text_field} - score_field: null - - _target_: 
nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedLinesFilter - max_repeated_line_fraction: 0.7 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedParagraphsFilter - max_repeated_paragraphs_ratio: 0.7 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedLinesByCharFilter - max_repeated_lines_char_ratio: 0.8 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedParagraphsByCharFilter - max_repeated_paragraphs_char_ratio: 0.8 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.WordCountFilter - min_words: 50 - max_words: 100000 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.PunctuationFilter - max_num_sentences_without_endmark_ratio: 0.85 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.WordsWithoutAlphabetsFilter - min_words_with_alphabets: 0.8 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.CommonEnglishWordsFilter - 
min_num_common_words: 2 - stop_at_false: True - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.MeanWordLengthFilter - max_mean_word_length: 10 - min_mean_word_length: 3 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.LongWordFilter - max_word_length: 1000 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.EllipsisFilter - max_num_lines_ending_with_ellipsis_ratio: 0.3 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Top N-Gram filters for N-gram 2 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter - n: 2 - max_repeating_ngram_ratio: 0.2 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Top N-Gram filters for N-gram 3 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter - n: 3 - max_repeating_ngram_ratio: 0.18 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Top N-Gram filters for N-gram 4 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter - n: 4 - max_repeating_ngram_ratio: 0.16 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 5 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 5 - 
max_repeating_duplicate_ngram_ratio: 0.15 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 6 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 6 - max_repeating_duplicate_ngram_ratio: 0.14 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 7 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 7 - max_repeating_duplicate_ngram_ratio: 0.13 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 8 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 8 - max_repeating_duplicate_ngram_ratio: 0.12 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 9 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 9 - max_repeating_duplicate_ngram_ratio: 0.11 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - # Duplicate N-gram filters for N-gram 10 - _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter - n: 10 - max_repeating_duplicate_ngram_ratio: 0.10 - text_field: ${text_field} - score_field: null - - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter - filter_obj: - _target_: nemo_curator.stages.text.filters.heuristic.string.BulletsFilter - max_bullet_lines_ratio: 0.9 - text_field: ${text_field} - score_field: null - - - _target_: 
nemo_curator.stages.text.io.writer.JsonlWriter - path: ${output_path} - fields: null - -``` - -File: /Users/mromeijn/src/Curator/tutorials/text/distributed-data-classification/README.md -```md -# Distributed Data Classification - -The following is a set of Jupyter notebook tutorials which demonstrate how to use various text classification models supported by NeMo Curator. -The goal of using these classifiers is to help with data annotation, which is useful in data blending for foundation model training. - -Each of these classifiers is available on Hugging Face and can be run independently with the [Transformers](https://github.com/huggingface/transformers) library. -By running them with NeMo Curator, the classifiers are accelerated using a heterogeneous pipeline setup where tokenization is run across CPUs and model inference is run across GPUs. -Each of the Jupyter notebooks in this directory demonstrates how to run the classifiers on text data and is easily scalable to large amounts of data. - -Before running any of these notebooks, see this [Installation Guide](https://docs.nvidia.com/nemo/curator/latest/admin/installation.html#admin-installation) page for instructions on how to install NeMo Curator. Be sure to use an installation method which includes GPU dependencies. - -For more information about the classifiers, refer to our [Distributed Data Classification](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/quality-assessment/distributed-classifier.html) documentation page. - -## List of Classifiers - -
- -| NeMo Curator Classifier | Description | Hugging Face Page | -| --- | --- | --- | -| `AegisClassifier` | Identify and categorize unsafe content per document | [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0) | -| `ContentTypeClassifier` | Categorize the type-of-speech per document | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) | -| `DomainClassifier` | Categorize the domain per document | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) | -| `FineWebEduClassifier` | Determine the educational value per document; this model was trained using annotations from Llama 3 70B-Instruct | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) | -| `FineWebMixtralEduClassifier` | Determine the educational value per document; this model was trained using annotations from Mixtral 8x22B-Instruct | [nvidia/nemocurator-fineweb-mixtral-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) | -| `FineWebNemotronEduClassifier` | Determine the educational value per document; this model was trained using annotations from Nemotron-4-340B-Instruct | [nvidia/nemocurator-fineweb-nemotron-4-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) | -| `InstructionDataGuardClassifier` | Identify LLM poisoning attacks per document | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) | -| `MultilingualDomainClassifier` | Categorize the domain per document; supports classification in 52 languages | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) | -| 
`PromptTaskComplexityClassifier` | Classify text prompts across task types and complexity dimensions | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) | -| `QualityClassifier` | Categorize documents as high, medium, or low quality | [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) | - -
- -Note that all classifiers support English text classification only, except the `MultilingualDomainClassifier`. - -## Bring Your Own Classifier - -Advanced users may want to integrate their own Hugging Face classifier(s) into NeMo Curator. Broadly, this requires creating a `CompositeStage` consisting of a CPU-based tokenizer stage and a GPU-based model inference stage. Refer to the [Text Classifiers README](https://github.com/NVIDIA-NeMo/Curator/tree/main/nemo_curator/stages/text/classifiers#text-classifiers) for details about how to do this. - -``` -
diff --git a/skills/nemotron-customize/context/eval-deploy-formats.txt b/skills/nemotron-customize/context/eval-deploy-formats.txt deleted file mode 100644 index a417d966a..000000000 --- a/skills/nemotron-customize/context/eval-deploy-formats.txt +++ /dev/null @@ -1,743 +0,0 @@ - -/Users/mromeijn/src/Evaluator -├── docs -│ └── deployment -│ ├── launcher-orchestrated -│ │ └── index.md * -│ └── nemo-fw -│ ├── hf.md * -│ ├── index.md * -│ └── mbridge.md * -└── packages - └── nemo-evaluator-launcher - ├── examples - │ ├── local_nim.yaml * - │ ├── local_vllm_logprobs.yaml * - │ └── slurm_nim.yaml * - └── src - └── nemo_evaluator_launcher - ├── configs - │ ├── deployment - │ │ ├── nim.yaml * - │ │ └── vllm.yaml * - │ └── default.yaml * - └── resources - └── config_templates - └── deployment - ├── nim.yaml * - └── vllm.yaml * - - -File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/index.md -```md -(deployment-nemo-fw)= -# Deploy and Evaluate Checkpoints Trained by NeMo Framework - -The NeMo Framework is NVIDIA’s GPU-accelerated, end-to-end training platform for large language models (LLMs), multimodal models, and speech models. It enables seamless scaling of both pretraining and post-training workloads, from a single GPU to clusters with thousands of nodes, supporting Hugging Face/PyTorch and Megatron models. NeMo includes a suite of libraries and curated training recipes to help users build models from start to finish. - -The NeMo Evaluator is integrated within NeMo Framework, offering streamlined deployment and advanced evaluation capabilities for models trained using NeMo, leveraging state-of-the-art evaluation harnesses. 
- -## Features - -- **Multi-Backend Deployment**: Supports PyTriton and multi-instance evaluations using the Ray Serve deployment backend -- **Production-Ready**: Supports high-performance inference with CUDA graphs and flash decoding for Megatron models, vLLM backend for Hugging Face models and TRTLLM engine for TRTLLM models -- **Multi-GPU and Multi-Node Support**: Enables distributed inference across multiple GPUs and compute nodes -- **OpenAI-Compatible API**: Provides RESTful endpoints aligned with OpenAI API specifications - -## Architecture - -### 1. Deployment Layer - -- **PyTriton Backend**: Provides high-performance inference through the NVIDIA Triton Inference Server, with OpenAI API compatibility via a FastAPI interface. Supports model parallelism across single-node and multi-node configurations. Note: Multi-instance evaluation is not supported. -- **Ray Backend**: Enables multi-instance evaluation with model parallelism on a single node using Ray Serve, while maintaining OpenAI API compatibility. Multi-node support is coming soon. - -For more information on the deployment, please see [NeMo Export-Deploy](https://github.com/NVIDIA-NeMo/Export-Deploy). - -### 2. Evaluation Layer - -- **NeMo Evaluator**: Provides standardized benchmark evaluations using packages from NVIDIA Eval Factory, bundled in the NeMo Framework container. The `lm-evaluation-harness` is pre-installed by default, and additional evaluation packages can be added as needed. For more information, see {ref}`core-wheels` and {ref}`lib-core`. 
- - - -```{toctree} -:maxdepth: 1 -:hidden: - -Introduction -PyTriton Serving Backend -Ray Serving Backend -Evaluate Megatron Bridge Checkpoints -Evaluate Automodel Checkpoints -Evaluate TRTLLM Checkpoints -``` -``` - -File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/hf.md -```md -# Evaluate Automodel Checkpoints Trained by NeMo Framework - -This guide provides step-by-step instructions for evaluating checkpoints trained using the NeMo Framework with the Automodel backend. This section specifically covers evaluation with [nvidia-lm-eval](https://pypi.org/project/nvidia-lm-eval/), a wrapper around the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) tool. - -Here, we focus on benchmarks within the `lm-evaluation-harness` that depend on text generation. Evaluation on log-probability-based benchmarks is available in [Evaluate Automodel Checkpoints on Log-probability benchmarks](#evaluate-automodel-checkpoints-on-log-probability-benchmarks). - -## Deploy Automodel Checkpoints - -This section outlines the steps to deploy Automodel checkpoints using Python commands. - -Automodel checkpoint deployment uses Ray Serve as the serving backend. It also offers an OpenAI API (OAI)-compatible endpoint, similar to deployments of checkpoints trained with the Megatron Core backend. An example deployment command is shown below. - -```{literalinclude} _snippets/deploy_hf.sh -:language: bash -:start-after: "# [snippet-start]" -:end-before: "# [snippet-end]" -``` - -The `--model_path` can refer to either a local checkpoint path or a Hugging Face model ID, as shown in the example above. In the example above, checkpoint deployment uses the `vLLM` backend. To enable accelerated inference, install `vLLM` in your environment. 
To install `vLLM` inside the NeMo Framework container, follow the steps below as shared in [Export-Deploy's README](https://github.com/NVIDIA-NeMo/Export-Deploy?tab=readme-ov-file#install-tensorrt-llm-vllm-or-trt-onnx-backend:~:text=cd%20/opt/export%2ddeploy%0auv%20sync%20%2d%2dinexact%20%2d%2dlink%2dmode%20symlink%20%2d%2dlocked%20%2d%2dextra%20vllm%20%24(cat%20/opt/uv_args.txt)): - -```shell -cd /opt/Export-Deploy -uv sync --inexact --link-mode symlink --locked --extra vllm $(cat /opt/uv_args.txt) -``` - -To install `vLLM` outside of the NeMo Framework container, follow the steps mentioned [here](https://github.com/NVIDIA-NeMo/Export-Deploy?tab=readme-ov-file#install-tensorrt-llm-vllm-or-trt-onnx-backend:~:text=install%20transformerengine%20%2b%20vllm). - -:::{note} -The 25.11 release of the NeMo Framework container comes with `vLLM` pre-installed, so it is not necessary to install it explicitly. However, for all previous releases, please refer to the instructions above to install `vLLM` inside the NeMo Framework container. -::: - -If you prefer to evaluate the Automodel checkpoint without using the `vLLM` backend, remove the `--use_vllm_backend` flag from the command above. - -:::{note} -To speed up evaluation using multiple instances, increase the `num_replicas` parameter. -For additional guidance, refer to {ref}`nemo-fw-ray`. -::: - -## Evaluate Automodel Checkpoints - -This section outlines the steps to evaluate Automodel checkpoints using Python commands. This method is quick and easy, making it ideal for interactive evaluations. - -Once deployment is successful, you can run evaluations using the {ref}`lib-core` API. - -Before starting the evaluation, it’s recommended to use the [`check_endpoint`](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator/src/nemo_evaluator/core/utils.py) function to verify that the endpoint is responsive and ready to accept requests.
- -```{literalinclude} _snippets/mmlu.py -:language: python -:start-after: "## Run the evaluation" -``` - -## Evaluate Automodel Checkpoints on Log-probability Benchmarks - -To evaluate Automodel checkpoints on benchmarks that require log-probabilities, use the same deployment command provided in [Deploy Automodel Checkpoints](#deploy-automodel-checkpoints). These benchmarks are supported by both the `vLLM` backend (enabled via the `--use_vllm_backend` flag) and by directly deploying the Automodel checkpoint. - -For evaluation, you must specify the path to the `tokenizer` and set the `tokenizer_backend` parameter as shown below. The `tokenizer` files are located within the checkpoint directory. - -```{literalinclude} _snippets/arc_challenge_hf.py -:language: python -:start-after: "## Run the evaluation" -``` - -## Evaluate Automodel Checkpoints on Chat Benchmarks - -To evaluate Automodel checkpoints on chat benchmarks you need the chat endpoint (`/v1/chat/completions/`). The deployment command provided in [Deploy Automodel Checkpoints](#deploy-automodel-checkpoints) also exposes the chat endpoint, and the same command can be used for evaluating on chat benchmarks. - -For evaluation, update the URL by replacing `/v1/completions/` with `/v1/chat/completions/` as shown below. Additionally, set the `type` field to `"chat"` to indicate a chat benchmark. - -```{literalinclude} _snippets/ifeval.py -:language: python -:start-after: "## Run the evaluation" -``` - -``` - -File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/mbridge.md -```md -# Evaluate Megatron Bridge Checkpoints Trained by NeMo Framework - -This guide provides step-by-step instructions for evaluating [Megatron Bridge](https://docs.nvidia.com/nemo/megatron-bridge/latest/index.html) checkpoints trained using the NeMo Framework with the Megatron Core backend. 
This section specifically covers evaluation with [nvidia-lm-eval](https://pypi.org/project/nvidia-lm-eval/), a wrapper around the [ -lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) tool. - -First, we focus on benchmarks within the `lm-evaluation-harness` that depend on text generation. Evaluation on log-probability-based benchmarks is available in the subsequent section [Evaluate Megatron Bridge Checkpoints on Log-probability benchmarks](#evaluate-megatron-bridge-checkpoints-on-log-probability-benchmarks). - -## Deploy Megatron Bridge Checkpoints - -To evaluate a checkpoint saved during pretraining or fine-tuning with [Megatron-Bridge](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html), provide the path to the saved checkpoint using the `--megatron_checkpoint` flag in the deployment command below. Otherwise, Hugging Face checkpoints can be converted to Megatron Bridge using the single shell command: - -```bash -huggingface-cli login --token -python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-llama/Meta-Llama-3-8B','/workspace/mbridge_llama3_8b/')" -``` - -The deployment scripts are available inside the [`/opt/Export-Deploy/scripts/deploy/nlp/`](https://github.com/NVIDIA-NeMo/Export-Deploy/tree/main/scripts/deploy/nlp) directory. Below is an example command for deployment. It uses a Hugging Face LLaMA 3 8B checkpoint that has been converted to Megatron Bridge format using the command shared above. - -```{literalinclude} _snippets/deploy_mbridge.sh -:language: bash -:start-after: "# [snippet-start]" -:end-before: "# [snippet-end]" -``` - -:::{note} -Megatron Bridge creates checkpoints in directories named `iter_N`, where *N* is the iteration number. Each `iter_N` directory contains model weights and related artifacts. When using a checkpoint, make sure to provide the path to the appropriate `iter_N` directory. 
Hugging Face checkpoints converted for Megatron Bridge are typically stored in a directory named `iter_0000000`, as shown in the command above. -::: - -:::{note} -Megatron Bridge deployment for evaluation is supported only with Ray Serve and not PyTriton. -::: - -## Evaluate Megatron Bridge Checkpoints - -Once deployment is successful, you can run evaluations using the NeMo Evaluator API. See {ref}`lib-core` for more details. - -Before starting the evaluation, it’s recommended to use the [`check_endpoint`](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator/src/nemo_evaluator/core/utils.py) function to verify that the endpoint is responsive and ready to accept requests. - -```{literalinclude} _snippets/mmlu.py -:language: python -:start-after: "## Run the evaluation" -``` - -## Evaluate Megatron Bridge Checkpoints on Log-probability Benchmarks - -To evaluate Megatron Bridge checkpoints on benchmarks that require log-probabilities, use the same deployment command provided in [Deploy Megatron Bridge Checkpoints](#deploy-megatron-bridge-checkpoints). - -For evaluation, you must specify the path to the `tokenizer` and set the `tokenizer_backend` parameter as shown below. The `tokenizer` files are located within the `tokenizer` directory of the checkpoint. - -```{literalinclude} _snippets/arc_challenge_mbridge.py -:language: python -:start-after: "## Run the evaluation" -``` - -## Evaluate Megatron Bridge Checkpoints on Chat Benchmarks - -To evaluate Megatron Bridge checkpoints on chat benchmarks you need the chat endpoint (/v1/chat/completions/). The deployment command provided in [Deploy Megatron Bridge Checkpoints](#deploy-megatron-bridge-checkpoints) also exposes the chat endpoint, and the same command can be used for evaluating on chat benchmarks. - -For evaluation, update the URL by replacing `/v1/completions/` with `/v1/chat/completions/` as shown below. Additionally, set the `type` field to `"chat"` to indicate a chat benchmark. 
- -```{literalinclude} _snippets/ifeval.py -:language: python -:start-after: "## Run the evaluation" -``` - -``` - -File: /Users/mromeijn/src/Evaluator/docs/deployment/launcher-orchestrated/index.md -```md ---- -orphan: true ---- -(launcher-orchestrated-deployment)= - -# Launcher-Orchestrated Deployment - -Let NeMo Evaluator Launcher handle both model deployment and evaluation orchestration automatically. This is the recommended approach for most users, providing automated lifecycle management, multi-backend support, and integrated monitoring. - -## Overview - -Launcher-orchestrated deployment means the launcher: -- Deploys your model using the specified deployment type -- Manages the model serving lifecycle -- Runs evaluations against the deployed model -- Handles cleanup and resource management - -The launcher supports multiple deployment backends and execution environments. - -## Quick Start - -```bash -# Deploy model and run evaluation in one command (Slurm example) -HOSTNAME=cluster-login-node.com -ACCOUNT=my_account -OUT_DIR=/absolute/path/on/login/node - -nemo-evaluator-launcher run \ - -o execution.hostname=$HOSTNAME \ - -o execution.account=$ACCOUNT \ - -o execution.output_dir=$OUT_DIR \ - --config packages/nemo-evaluator-launcher/examples/slurm_vllm_basic.yaml -``` - -## Execution Backends - -Choose the execution backend that matches your infrastructure: - -::::{grid} 1 2 2 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`desktop-download;1.5em;sd-mr-1` Local Execution -:link: local -:link-type: doc -Run evaluations on your local machine against existing endpoints. **Note**: Local executor does **not** deploy models. Use Slurm or Lepton for deployment. -::: - -:::{grid-item-card} {octicon}`server;1.5em;sd-mr-1` Slurm Deployment -:link: slurm -:link-type: doc -Deploy on HPC clusters with Slurm workload manager. Ideal for large-scale evaluations with multi-node parallelism. 
-::: - -:::{grid-item-card} {octicon}`cloud;1.5em;sd-mr-1` Lepton Deployment -:link: lepton -:link-type: doc -Deploy on Lepton AI cloud platform. Best for cloud-native deployments with managed infrastructure and auto-scaling. -::: - -:::: - -## Deployment Types - -The launcher supports multiple deployment types: - -### vLLM Deployment -- **Fast inference** with optimized attention mechanisms -- **Continuous batching** for high throughput -- **Tensor parallelism** support for large models -- **Memory optimization** with configurable GPU utilization - -### NIM Deployment -- **Production-grade reliability** with enterprise features -- **NVIDIA optimized containers** for maximum performance -- **Built-in monitoring** and logging capabilities -- **Enterprise security** features - -### SGLang Deployment -- **Structured generation** support for complex tasks -- **Function calling** capabilities -- **JSON mode** for structured outputs -- **Efficient batching** for high throughput - -### No Deployment -- **Use existing endpoints** without launcher deployment -- **Bring-your-own-endpoint** integration -- **Flexible configuration** for any OpenAI-compatible API - -## Configuration Overview - -Basic configuration structure for launcher-orchestrated deployment: - -```yaml -# Use Hydra defaults to compose config -defaults: - - execution: slurm/default # or lepton/default; local does not deploy - - deployment: vllm # or nim, sglang, none - - _self_ - -# Deployment configuration -deployment: - checkpoint_path: /path/to/model # Or HuggingFace model ID - served_model_name: my-model - # ... deployment-specific options - -# Execution backend configuration -execution: - account: my-account - output_dir: /path/to/results - # ... 
backend-specific options - -# Evaluation tasks -evaluation: - tasks: - - name: mmlu_pro - - name: gsm8k -``` - -## Key Benefits - -### Automated Lifecycle Management -- **Deployment automation**: No manual setup required -- **Resource management**: Automatic allocation and cleanup -- **Error handling**: Built-in retry and recovery mechanisms -- **Monitoring integration**: Real-time status and logging - -### Multi-Backend Support -- **Consistent interface**: Same commands work across all backends -- **Environment flexibility**: Local development to production clusters -- **Resource optimization**: Backend-specific optimizations -- **Scalability**: From single GPU to multi-node deployments - -### Integrated Workflows -- **End-to-end automation**: From model to results in one command -- **Configuration management**: Version-controlled, reproducible configs -- **Result integration**: Built-in export and analysis tools -- **Monitoring and debugging**: Comprehensive logging and status tracking - -## Getting Started - -1. **Choose your backend**: Start with {ref}`launcher-orchestrated-local` for development -2. **Configure your model**: Set deployment type and model path -3. **Run evaluation**: Use the launcher to deploy and evaluate -4. **Monitor progress**: Check status and logs during execution -5. 
**Analyze results**: Export and analyze evaluation outcomes - -## Next Steps - -- **Local Development**: Start with {ref}`launcher-orchestrated-local` for testing -- **Scale Up**: Move to {ref}`launcher-orchestrated-slurm` for production workloads -- **Cloud Native**: Try {ref}`launcher-orchestrated-lepton` for managed infrastructure -- **Configure Adapters**: Set up {ref}`adapters` for custom processing - -```{toctree} -:maxdepth: 1 -:hidden: - -Local Deployment -Slurm Deployment -Lepton Deployment -``` - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_nim.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# How to use: -# -# 1. copy this file locally or clone the repository -# 2. set the NIM image and model name for your model -# 3. replace /path/to/nim/cache with the absolute path to the NIM cache directory on your machine -# 4. (optional) comment out limit_samples to run on the full dataset -# 5. run `nemo-evaluator-launcher run --config path/to/local_nim.yaml` -# -# ⚠️ WARNING: -# Always run full evaluations (without limit_samples) for actual benchmark results. -# Using a subset of samples is solely for testing configuration and setup. -# Results from such test runs should NEVER be used to compare models or -# report benchmark performance. 
- -defaults: - - execution: local - - deployment: nim - - _self_ - -execution: - output_dir: nel-results-nim - mounts: - deployment: - # Replace /path/to/nim/cache with the absolute path to the NIM cache directory on your machine - /path/to/nim/cache: /opt/nim/.cache - -# NIM deployment configuration -# Note: model_id is auto-derived from deployment.served_model_name -deployment: - image: nvcr.io/nim/meta/llama-3.2-1b-instruct:latest - served_model_name: meta/llama-3.2-1b-instruct - env_vars: - NGC_API_KEY: host:NGC_API_KEY - -# Specify the benchmarks to evaluate -evaluation: - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - parallelism: 4 - limit_samples: 10 # TEST ONLY: remove for full evaluation - tasks: - - name: lm-evaluation-harness.ifeval - - name: gsm8k - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/slurm_nim.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# How to use: -# -# 1. copy this file locally or clone the repository -# 2. set the required values (marked with ???) or pass them via -o cli arguments, e.g. -# -o execution.hostname=my-cluster.com -o execution.output_dir=/path/on/cluster -o execution.account=my-account -# 3. replace /path/to/nim/cache with the absolute path to the NIM cache directory on the cluster -# 4. 
(optional) run with 10 samples for quick testing: -# -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10 -# 5. run full evaluation: -# nemo-evaluator-launcher run --config path/to/slurm_nim.yaml -# -# ⚠️ WARNING: -# Always run full evaluations (without limit_samples) for actual benchmark results. -# Using a subset of samples is solely for testing configuration and setup. -# Results from such test runs should NEVER be used to compare models or -# report benchmark performance. - -defaults: - - execution: slurm/default - - deployment: nim - - _self_ - -# SLURM execution configuration -execution: - hostname: ??? # SLURM headnode hostname (required) - account: ??? # SLURM account (required) - output_dir: ??? # ABSOLUTE path on cluster (required) - mounts: - deployment: - # Replace /path/to/nim/cache with the absolute path to the NIM cache directory on the cluster - /path/to/nim/cache: /opt/nim/.cache - -# NIM deployment configuration -deployment: - image: nvcr.io/nim/meta/llama-3.2-1b-instruct:latest - served_model_name: meta/llama-3.2-1b-instruct - env_vars: - NGC_API_KEY: host:NGC_API_KEY - -# Specify the benchmarks to evaluate -evaluation: - tasks: - - name: lm-evaluation-harness.ifeval - - name: gsm8k - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# How to use: -# -# 1. copy this file locally or clone the repository -# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing -# 3. run `nemo-evaluator-launcher run --config path/to/local_vllm_logprobs.yaml` - -# ⚠️ WARNING: -# Always run full evaluations (without limit_samples) for actual benchmark results. -# Using a subset of samples is solely for testing configuration and setup. -# Results from such test runs should NEVER be used to compare models or -# report benchmark performance. - -# [docs-start-snippet] -defaults: - - execution: local - - deployment: vllm - - _self_ - -execution: - output_dir: llama_local -deployment: - checkpoint_path: null - hf_model_handle: meta-llama/Llama-3.1-8B - served_model_name: meta-llama/Llama-3.1-8B - tensor_parallel_size: 1 - data_parallel_size: 1 - extra_args: "--max-model-len 32768" - env_vars: - HF_TOKEN: host:HF_TOKEN - -# specify the benchmarks to evaluate -evaluation: - # global config settings that apply to all tasks, unless overridden by task-specific config - nemo_evaluator_config: - config: - params: - request_timeout: 3600 # timeout for API request in seconds - parallelism: 1 # 1 parallel request to avoid overloading the server - # limit_samples: 10 # uncomment to limit number of samples for quick testing - extra: # for log-probability tasks like piqa, you need to specify the tokenizer - tokenizer: meta-llama/Llama-3.1-8B # or use a path to locally stored checkpoint - tokenizer_backend: huggingface # or "tiktoken" - env_vars: - HF_TOKEN: host:HF_TOKEN # needed to access the tokenizer on the client side - tasks: - - name: piqa - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/default.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -defaults: - - execution: local - - deployment: none - - _self_ - -# Top-level env vars applied to all jobs (deployment + evaluation). -# Values use explicit prefixes: "host:VAR_NAME", "lit:value", "runtime:VAR_NAME". -# Section-level and task-level env_vars override these. -env_vars: {} - -# NOTE(dfridman): If deployment is used, `target` parameters will be automatically populated. -target: - api_endpoint: - url: ??? - model_id: ??? - api_key_name: "" # NOTE: the name of the env var - -evaluation: [] - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/nim.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -type: nim -image: ??? 
# e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6 -served_model_name: ??? -port: 8000 - -command: /opt/nim/start_server.sh - -# NIM containers use default entrypoint - no custom command needed -# Configuration is done via environment variables in lepton_config - -endpoints: - chat: /v1/chat/completions - completions: /v1/completions - health: /v1/health/ready -# Note: Environment variables should be configured in lepton_config.envs -# Auto-derived environment variables from deployment config: -# - SERVED_MODEL_NAME (from served_model_name) -# - NIM_MODEL_NAME (from served_model_name for NIM) -# - MODEL_PORT (from port) - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -type: vllm -image: vllm/vllm-openai:latest -checkpoint_path: ??? -served_model_name: ??? 
-port: 8000 -tensor_parallel_size: 8 -pipeline_parallel_size: 1 -data_parallel_size: 1 -gpu_memory_utilization: 0.95 -extra_args: "" -env_vars: {} # {name: value} dict - -endpoints: - chat: /v1/chat/completions - completions: /v1/completions - health: /health - -command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint} - --tensor-parallel-size=${deployment.tensor_parallel_size} - --pipeline-parallel-size=${deployment.pipeline_parallel_size} - --data-parallel-size=${deployment.data_parallel_size} - --port ${deployment.port} - --trust-remote-code - --served-model-name ${deployment.served_model_name} - --gpu-memory-utilization ${deployment.gpu_memory_utilization} - ${deployment.extra_args} - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/deployment/nim.yaml -```yaml -defaults: - - deployment: nim - -execution: - env_vars: - deployment: - NGC_API_KEY: $NGC_API_KEY # Required for NIM container authentication - mounts: - deployment: - /path/to/nim/cache: /opt/nim/.cache # Replace with absolute path to NIM cache directory - -deployment: - image: ??? # NIM image (e.g., nvcr.io/nim/meta/llama-3.2-1b-instruct:latest) - served_model_name: ??? # Model name (e.g., meta/llama-3.2-1b-instruct) - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/deployment/vllm.yaml -```yaml -defaults: - - deployment: vllm - -execution: - env_vars: - deployment: - HF_TOKEN: $HF_TOKEN # Required for gated HuggingFace models - -deployment: - checkpoint_path: null # Set to path if using local checkpoint - hf_model_handle: ??? # HuggingFace model handle (e.g., meta-llama/Llama-3.1-8B) - served_model_name: ??? 
# Model name for API (e.g., meta-llama/Llama-3.1-8B) - tensor_parallel_size: 1 - data_parallel_size: 1 - extra_args: "--max-model-len 32768" - -``` - diff --git a/skills/nemotron-customize/context/eval-standard-nlu.txt b/skills/nemotron-customize/context/eval-standard-nlu.txt deleted file mode 100644 index 95a43ad69..000000000 --- a/skills/nemotron-customize/context/eval-standard-nlu.txt +++ /dev/null @@ -1,1920 +0,0 @@ - -/Users/mromeijn/src/Evaluator -├── docs -│ └── evaluation -│ ├── benchmarks -│ │ ├── catalog -│ │ │ └── index.md * -│ │ └── about.md * -│ ├── run-evals -│ │ ├── index.md * -│ │ ├── logprobs.md * -│ │ ├── reasoning.md * -│ │ └── text-gen.md * -│ ├── index.md * -│ └── parameters.md * -└── packages - └── nemo-evaluator-launcher - ├── examples - │ ├── local_basic.yaml * - │ └── local_reasoning.yaml * - └── src - └── nemo_evaluator_launcher - └── resources - └── config_templates - └── evaluation - ├── base - │ ├── default.yaml * - │ └── standard.yaml * - └── chat - └── default.yaml * - - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/index.md -```md -(evaluation-overview)= - -# About Evaluation - -Evaluate LLMs, VLMs, agentic systems, and retrieval models across 100+ benchmarks using unified workflows. - -## Before You Start - -Before you run evaluations, ensure you have: - -1. **Chosen your approach**: See {ref}`get-started-overview` for installation and setup guidance -2. **Deployed your model**: See {ref}`deployment-overview` for deployment options -3. **OpenAI-compatible endpoint**: Your model must expose a compatible API (see {ref}`deployment-testing-compatibility`). -4. **API credentials**: Access tokens for your model endpoint and Hugging Face Hub. - ---- - -## Quick Start: Academic Benchmarks - -:::{admonition} Fastest path to evaluate academic benchmarks -:class: tip - -**For researchers and data scientists**: Evaluate your model on standard academic benchmarks in 3 steps. 
- -**Step 1: Choose Your Approach** -- **Launcher CLI** (Recommended): `nemo-evaluator-launcher run --config packages/nemo-evaluator-launcher/examples/local_basic.yaml` -- **Python API**: Direct programmatic control with `evaluate()` function - -**Step 2: Select Benchmarks** - -Common academic suites: -- **General Knowledge**: `mmlu_pro`, `gpqa_diamond` -- **Mathematical Reasoning**: `AIME_2025`, `mgsm` -- **Instruction Following**: `ifbench`, `mtbench` - - - -Discover all available tasks: -```bash -nemo-evaluator-launcher ls tasks -``` - -**Step 3: Run Evaluation** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: mmlu_pro - - name: ifbench -``` - -Launch the job: - -```bash -export NGC_API_KEY=nvapi-... - -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - - -::: - ---- - -## Evaluation Workflows - -Select a workflow based on your environment and desired level of control. - -::::{grid} 1 2 2 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`terminal;1.5em;sd-mr-1` Launcher Workflows -:link: ../get-started/quickstart/launcher -:link-type: doc -Unified CLI for running evaluations across local, Slurm, and cloud backends with built-in result export. -::: - -:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` Core API Workflows -:link: ../libraries/nemo-evaluator/workflows/python-api -:link-type: doc -Programmatic evaluation using Python API for integration into ML pipelines and custom workflows. -::: - -:::{grid-item-card} {octicon}`package;1.5em;sd-mr-1` Container Workflows -:link: ../libraries/nemo-evaluator/containers/index -:link-type: doc -Direct container access for specialized use cases and custom evaluation environments. 
-::: - -:::: - -## Configuration and Customization - -Configure your evaluations, create custom tasks, explore benchmarks, and extend the framework with these guides. - -::::{grid} 1 2 2 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`plus;1.5em;sd-mr-1` Configuration Parameters -:link: parameters -:link-type: doc -Comprehensive reference for evaluation configuration parameters and framework-specific settings. -::: - -:::{grid-item-card} {octicon}`list-unordered;1.5em;sd-mr-1` Benchmark Catalog -:link: eval-benchmarks -:link-type: ref -Explore 100+ available benchmarks across 18 evaluation harnesses and their specific use cases. -::: - -:::{grid-item-card} {octicon}`plus;1.5em;sd-mr-1` Extend Framework -:link: ../libraries/nemo-evaluator/extending/framework-definition-file/index -:link-type: doc -Add custom evaluation frameworks using Framework Definition Files for specialized benchmarks. -::: - -:::: - -## Advanced Features - -Scale your evaluations, export results, customize adapters, and resolve issues with these advanced features. - -::::{grid} 1 2 2 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`workflow;1.5em;sd-mr-1` Multi-Backend Execution -:link: ../libraries/nemo-evaluator-launcher/configuration/executors/index -:link-type: doc -Run evaluations on local machines, HPC clusters, or cloud platforms with unified configuration. -::: - -:::{grid-item-card} {octicon}`database;1.5em;sd-mr-1` Result Export -:link: ../libraries/nemo-evaluator-launcher/exporters/index -:link-type: doc -Export evaluation results to MLflow, Weights & Biases, Google Sheets, and other platforms. -::: - -:::{grid-item-card} {octicon}`shield;1.5em;sd-mr-1` Adapter System -:link: ../libraries/nemo-evaluator/interceptors/index -:link-type: doc -Configure request/response processing, logging, caching, and custom interceptors. -::: - -:::: - -## Core Evaluation Concepts - -- For architectural details and core concepts, refer to {ref}`evaluation-model`. 
-- For container specifications, refer to {ref}`nemo-evaluator-containers`. - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/benchmarks/about.md -```md -(eval-benchmarks)= - -# About Selecting Benchmarks - -NeMo Evaluator provides a comprehensive suite of benchmarks spanning academic reasoning, code generation, safety testing, and domain-specific evaluations. Whether you're validating a new model's capabilities or conducting rigorous academic research, you'll find the right benchmarks to assess your AI system's performance. -See {ref}`benchmarks-full-list` for the complete catalog of available benchmarks. - -## Available via Launcher - -```{literalinclude} ../_snippets/commands/list_tasks.sh -:language: bash -:start-after: "# [snippet-start]" -:end-before: "# [snippet-end]" -``` - -## Available via Direct Container Access - -```{literalinclude} ../_snippets/commands/list_tasks_core.sh -:language: bash -:start-after: "# [snippet-start]" -:end-before: "# [snippet-end]" -``` - -## Choosing Benchmarks for Academic Research - -:::{admonition} Benchmark Selection Guide -:class: tip - -**For General Knowledge**: -- `mmlu_pro` - Expert-level knowledge across 14 domains -- `gpqa_diamond` - Graduate-level science questions - -**For Mathematical & Quantitative Reasoning**: -- `AIME_2025` - American Invitational Mathematics Examination (AIME) 2025 questions -- `mgsm` - Multilingual math reasoning - -**For Instruction Following & Alignment**: -- `ifbench` - Precise instruction following -- `mtbench` - Multi-turn conversation quality - -See benchmark categories below and {ref}`benchmarks-full-list` for more details. 
-::: - -## Benchmark Categories - -### **Academic and Reasoning** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **simple-evals** - - Common evaluation tasks - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/simple-evals) - - GPQA-D, MATH-500, AIME 24 & 25, HumanEval, HumanEval+, MGSM, MMLU (also multilingual), MMLU-Pro, MMLU-lite (AR, BN, DE, EN, ES, FR, HI, ID, IT, JA, KO, MY, PT, SW, YO, ZH), SimpleQA, BrowseComp, HealthBench -* - **lm-evaluation-harness** - - Language model benchmarks - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/lm-evaluation-harness) - - ARC Challenge (also multilingual), GSM8K, HumanEval, HumanEval+, MBPP, MBPP+, MINERVA Math, RACE, AGIEval, BBH, BBQ, CSQA, Frames, Global MMLU, GPQA-D, HellaSwag (also multilingual), IFEval, MGSM, MMLU, MMLU-Pro, MMLU-ProX (de, es, fr, it, ja), MMLU-Redux, MUSR, OpenbookQA, Piqa, Social IQa, TruthfulQA, WikiLingua, WinoGrande -* - **hle** - - Academic knowledge and problem solving - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/hle) - - HLE -* - **ifbench** - - Instruction following - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/ifbench) - - IFBench -* - **mtbench** - - Multi-turn conversation evaluation - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench) - - MT-Bench -* - **nemo-skills** - - Language model benchmarks (science, math, agentic) - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/nemo_skills) - - AIME 24 & 25, BFCL_v3, GPQA, HLE, LiveCodeBench, MMLU, MMLU-Pro -* - **profbench** - - Evaluation of professional knowledge across Physics PhD, Chemistry PhD, Finance MBA and Consulting MBA - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench) - - Report Generation, LLM
Judge -``` - -:::{note} -BFCL tasks from the nemo-skills container require function calling capabilities. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible. -::: - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: ifeval - - name: gsm8k_cot_instruct - - name: gpqa_diamond -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-... -export HF_TOKEN=hf_... - -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - -### **Code Generation** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **bigcode-evaluation-harness** - - Code generation evaluation - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bigcode-evaluation-harness) - - MBPP, MBPP-Plus, HumanEval, HumanEval+, Multiple (cpp, cs, d, go, java, jl, js, lua, php, pl, py, r, rb, rkt, rs, scala, sh, swift, ts) -* - **livecodebench** - - Coding - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/livecodebench) - - LiveCodeBench (v1-v6, 0724_0125, 0824_0225) -* - **scicode** - - Coding for scientific research - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/scicode) - - SciCode -``` - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: humaneval_instruct - - name: mbpp -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-...
- -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - -### **Safety and Security** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **garak** - - Safety and vulnerability testing - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/garak) - - Garak -* - **safety-harness** - - Safety and bias evaluation - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/safety-harness) - - Aegis v2, WildGuard -``` - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: aegis_v2 - - name: garak -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-... -export HF_TOKEN=hf_... - -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - -### **Function Calling** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **bfcl** - - Function calling - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bfcl) - - BFCL v2 and v3 -* - **tooltalk** - - Tool usage evaluation - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk) - - ToolTalk -``` - -:::{note} -Some of the tasks in this category require function calling capabilities. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible. 
-::: - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: bfclv2_ast_prompting - - name: tooltalk -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-... - -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - - -### **Vision-Language Models** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **vlmevalkit** - - Vision-language model evaluation - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit) - - AI2D, ChartQA, MMMU, MathVista-MINI, OCRBench, SlideVQA -``` - -:::{note} -The tasks in this category require a VLM chat endpoint. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible. -::: - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: ocrbench - - name: chartqa -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-... 
- -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - -### **Domain-Specific** - -```{list-table} -:header-rows: 1 -:widths: 20 30 30 50 - -* - Container - - Description - - NGC Catalog - - Benchmarks -* - **helm** - - Holistic evaluation framework - - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/helm) - - MedHelm -``` - -**Example Usage:** - -Create `config.yml`: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -evaluation: - tasks: - - name: pubmed_qa - - name: medcalc_bench -``` - -Run evaluation: - -```bash -export NGC_API_KEY=nvapi-... - -nemo-evaluator-launcher run \ - --config ./config.yml \ - -o execution.output_dir=results \ - -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \ - -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \ - -o +target.api_endpoint.api_key_name=NGC_API_KEY -``` - -## Container Details - -For detailed specifications of each container, see {ref}`nemo-evaluator-containers`. 
- -### Quick Container Access - -Pull and run any evaluation container directly: - -```bash -# Academic benchmarks -docker pull nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} -docker run --rm -it nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} - -# Code generation -docker pull nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:{{ docker_compose_latest }} -docker run --rm -it nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:{{ docker_compose_latest }} - -# Safety evaluation -docker pull nvcr.io/nvidia/eval-factory/safety-harness:{{ docker_compose_latest }} -docker run --rm -it nvcr.io/nvidia/eval-factory/safety-harness:{{ docker_compose_latest }} -``` - -### Available Tasks by Container - -For a complete list of available tasks in each container: - -```bash -# List tasks in any container -docker run --rm nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} nemo-evaluator ls - -# Or use the launcher for unified access -nemo-evaluator-launcher ls tasks -``` - -## Integration Patterns - -NeMo Evaluator provides multiple integration options to fit your workflow: - -```bash -# Launcher CLI (recommended for most users) -nemo-evaluator-launcher ls tasks -nemo-evaluator-launcher run --config ./local_mmlu_evaluation.yaml - -# Container direct execution -docker run --rm nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} nemo-evaluator ls - -# Python API (for programmatic control) -# See the Python API documentation for details -``` - -## Benchmark Selection Best Practices - -### For Model Development - -**Iterative Testing**: -- Start with `limit_samples=100` for quick feedback during development -- Run full evaluations before major releases -- Track metrics over time to measure improvement - -**Configuration**: -```python -# Development testing -params = ConfigParams( - limit_samples=100, # Quick iteration - temperature=0.01, # Deterministic - parallelism=4 -) - -# Production evaluation 
-params = ConfigParams( - limit_samples=None, # Full dataset - temperature=0.01, # Deterministic - parallelism=8 # Higher throughput -) -``` - -### For Specialized Domains - -- **Code Models**: Focus on `humaneval`, `mbpp`, `livecodebench` -- **Instruction Models**: Emphasize `ifbench`, `mtbench` -- **Multilingual Models**: Include `arc_multilingual`, `hellaswag_multilingual`, `mgsm` -- **Safety-Critical**: Prioritize `safety-harness` and `garak` evaluations - - -## Next Steps - -- **Container Details**: Browse {ref}`nemo-evaluator-containers` for complete specifications -- **Custom Benchmarks**: Learn {ref}`framework-definition-file` for custom evaluations - - - - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/benchmarks/catalog/index.md -```md -(benchmarks-full-list)= -# Available Benchmarks - - -```{include} all/benchmarks-table.md -``` - - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/index.md -```md -(eval-run)= - -# Evaluation Techniques - -Follow step-by-step guides for different evaluation scenarios and methodologies in NeMo Evaluator. - -## Before You Start - -Ensure you have: - -1. Completed the initial getting started guides for {ref}`gs-install` and {ref}`gs-quickstart`. -2. Have your endpoint and API key ready or prepared for the checkpoint you wish to deploy. -3. Prepared your [Hugging Face token](https://huggingface.co/docs/hub/en/security-tokens) for accessing gated datasets. - - -## Evaluations - -Select an evaluation type tailored to your model capabilities. - -::::{grid} 1 2 2 2 -:gutter: 1 1 1 2 - -:::{grid-item-card} {octicon}`pencil;1.5em;sd-mr-1` Text Generation -:link: text-gen -:link-type: ref -Measure model performance through natural language generation for academic benchmarks, reasoning tasks, and general knowledge assessment. 
-::: - -:::{grid-item-card} {octicon}`graph;1.5em;sd-mr-1` Log-Probability -:link: logprobs -:link-type: ref -Assess model confidence and uncertainty using log-probabilities for multiple-choice scenarios without text generation. -::: - -:::{grid-item-card} {octicon}`comment;1.5em;sd-mr-1` Reasoning -:link: run-eval-reasoning -:link-type: ref -Control the thinking budget and post-process the responses to extract the reasoning content and the final answer. -::: - - -:::: - - - - - - - -:::{toctree} -:hidden: -Text Generation <text-gen> -Log Probability <logprobs> -Reasoning <reasoning> -::: - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/text-gen.md -```md -(text-gen)= - -# Text Generation Evaluation - -Text generation evaluation is the primary method for assessing LLM capabilities where models produce natural language responses to prompts. This approach evaluates the quality, accuracy, and appropriateness of generated text across various tasks and domains. - - -:::{tip} -In the example below we use the `gpqa_diamond` benchmark, but the instructions provided apply to all text generation tasks, such as: - -- `mmlu` -- `mmlu_pro` -- `ifeval` -- `gsm8k` -- `mgsm` -- `mbpp` - -::: - -## Before You Start - -Ensure you have: - -- **Model Endpoint**: An OpenAI-compatible API endpoint for your model (completions or chat). See {ref}`deployment-testing-compatibility` for snippets you can use to test your endpoint. -- **API Access**: Valid API key if your endpoint requires authentication -- **Installed Packages**: NeMo Evaluator or access to evaluation containers - -## Evaluation Approach - -In text generation evaluation: - -1. **Prompt Construction**: Models receive carefully crafted prompts (questions, instructions, or text to continue) -2. **Response Generation**: Models generate natural language responses using their trained parameters -3. **Response Assessment**: Generated text is evaluated for correctness, quality, or adherence to specific criteria -4. 
**Metric Calculation**: Numerical scores are computed based on evaluation criteria - -This differs from **log-probability evaluation** where models assign confidence scores to predefined choices. -For log-probability methods, see the {ref}`logprobs`. - - -## Use NeMo Evaluator Launcher - -Use an example config for evaluating the [Meta Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model: - -```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_basic.yaml -:language: yaml -:start-after: "[docs-start-snippet]" -``` - - -To launch the evaluation, run: - -```bash - -export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here # GPQA is a gated dataset -export NGC_API_KEY=nvapi-your-token-here # API Key with access to build.nvidia.com - -nemo-evaluator-launcher run \ - --config packages/nemo-evaluator-launcher/examples/local_basic.yaml -``` - - -## Use NeMo Evaluator - -Start `simple-evals` docker container: - -```bash -docker run --rm -it nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} -``` - -or install `nemo-evaluator` and `nvidia-simple-evals` Python package in your environment of choice: - -```bash -pip install nemo-evaluator nvidia-simple-evals -``` - -### Run with CLI - -```bash -export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here # GPQA is a gated dataset -export NGC_API_KEY=nvapi-your-token-here # API Key with access to build.nvidia.com - -# Run evaluation -nemo-evaluator run_eval \ - --eval_type gpqa_diamond \ - --model_id meta/llama-3.2-3b-instruct \ - --model_url https://integrate.api.nvidia.com/v1/chat/completions \ - --model_type chat \ - --api_key_name NGC_API_KEY \ - --output_dir ./llama_3_1_8b_instruct_results -``` - -### Run with Python API - -```python -# set env variables before entering Python: -# export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here # GPQA is a gated dataset -# export NGC_API_KEY=nvapi-your-token-here # API Key with access to build.nvidia.com - -from 
nemo_evaluator.core.evaluate import evaluate -from nemo_evaluator.api.api_dataclasses import ( - ApiEndpoint, EvaluationConfig, EvaluationTarget, ConfigParams, EndpointType -) - -# Configure target endpoint -api_endpoint = ApiEndpoint( - url="https://integrate.api.nvidia.com/v1/chat/completions", - type=EndpointType.CHAT, - model_id="meta/llama-3.2-3b-instruct", - api_key="NGC_API_KEY" # variable name storing the key -) -target = EvaluationTarget(api_endpoint=api_endpoint) - -# Configure evaluation task -config = EvaluationConfig( - type="gpqa_diamond", - output_dir="./llama_3_1_8b_instruct_results" -) - -# Execute evaluation -results = evaluate(target_cfg=target, eval_cfg=config) -``` - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/logprobs.md -```md -(logprobs)= -# Evaluate LLMs Using Log-Probabilities - -## Introduction - -While the most typical approach to LLM evaluation involves assessing the quality of a model's generated response to a question, an alternative method uses **log-probabilities**. - -In this approach, we quantify a model's "surprise" or uncertainty when processing a text sequence. -This is done by calculating the sum of log-probabilities that the model assigns to each token. -A higher sum indicates the model is more confident about the sequence. - -In this evaluation approach: -* The LLM is given a single combined text containing both the question and a potential answer. -* Next, the sum of log-probabilities is calculated only for the tokens that belong to the answer. -* This allows an assessment of how likely it is that the model would provide that answer for the given question. - -For multiple-choice scenarios, the answer with the highest sum is treated as the one selected by the model. - -The sum of log-probabilities can be used to calculate different metrics, such as **perplexity**. 
-Additionally, log-probabilities can be analyzed to assess whether a response would be generated by the model using greedy sampling—a method commonly employed to evaluate **accuracy**. - -Using log-probabilities is especially useful for evaluating base (pre-trained) models, as it eliminates the need for complex instruction-following and does not require the model to adhere to a specific output format. - -:::{tip} -In the example below we use the `piqa` benchmark, but the instructions provided apply to all `lm-evaluation-harness` tasks utilizing log-probabilities, such as: - -- arc_challenge -- arc_multilingual -- bbh -- commonsense_qa -- hellaswag -- hellaswag_multilingual -- musr -- openbookqa -- social_iqa -- truthfulqa -- winogrande -::: - -## Before You Start - -Ensure you have: - -- **Completions Endpoint**: Log-probability tasks require a completions endpoint (not chat) that supports the `logprobs` and `echo` parameters (see {ref}`compatibility-log-probs`) -- **Model Tokenizer**: Access to tokenizer files for client-side tokenization (supported types: `huggingface` or `tiktoken`) -- **API Access**: Valid API key for your model endpoint if it is gated -- **Authentication**: Hugging Face token for gated datasets and tokenizers - - -## Use NeMo Evaluator Launcher - -Use an example config for deploying and evaluating the [Meta Llama 3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) model: - -```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml -:language: yaml -:start-after: "[docs-start-snippet]" -``` - -To launch the evaluation, run: - -```bash -nemo-evaluator-launcher run \ - --config packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml -``` - -:::{tip} -Set `deployment: none` and provide `target` specification if you want to evaluate an existing endpoint instead: - -```yaml -defaults: - - execution: local - - deployment: none - - _self_ - -execution: - output_dir: llama_local - env_vars: - HF_TOKEN: 
${oc.env:HF_TOKEN} # needed to access meta-llama/Llama-3.1-8B gated model - -target: - api_endpoint: - model_id: meta-llama/Llama-3.1-8B - url: https://your-endpoint.com/v1/completions - api_key_name: NGC_API_KEY # API Key with access to provided url - -# specify the benchmarks to evaluate -evaluation: - nemo_evaluator_config: # global config settings that apply to all tasks - config: - params: - extra: # for log-probability tasks like piqa, you need to specify the tokenizer - tokenizer: meta-llama/Llama-3.1-8B # or use a path to locally stored checkpoint - tokenizer_backend: huggingface # or "tiktoken" - tasks: - - name: piqa - -``` -::: - -## Use NeMo Evaluator - -Start `lm-evaluation-harness` docker container: - -```bash -docker run --rm -it nvcr.io/nvidia/eval-factory/lm-evaluation-harness:{{ docker_compose_latest }} -``` - -or install `nemo-evaluator` and `nvidia-lm-eval` Python package in your environment of choice: - -```bash -pip install nemo-evaluator nvidia-lm-eval -``` - - -To launch the evaluation, run the following Python code: - -```{literalinclude} ../_snippets/piqa_hf.py -:language: python -:start-after: "# [snippet-start]" -:end-before: "# [snippet-end]" -``` - -Make sure to provide the source for the tokenizer and a backend for loading it. - -For models trained with NeMo Framework, the tokenizer is stored inside the checkpoint directory. -For the NeMo format it is available inside `context/nemo_tokenizer` subdirectory: - -```python - extra={ - "tokenizer": "/workspace/llama3_8b_nemo2/context/nemo_tokenizer", - "tokenizer_backend": "huggingface", - }, -``` - -For Megatron Bridge checkpoints, the tokenizer is stored under `tokenizer` subdirectory: - -```python - extra={ - "tokenizer": "/workspace/mbridge_llama3_8b/iter_0000000/tokenizer", - "tokenizer_backend": "huggingface", - }, -``` - - -## How it works - -When the server receives a `logprobs=<int>` parameter in the request, it will return the log-probabilities of tokens. 
-When combined with `echo=true`, the model will include the input in its response, along with the corresponding log-probabilities. - -The received response is then processed on the client (benchmark) side to isolate the log-probabilities corresponding specifically to the answer portion of the input. -For this purpose the input is tokenized, which makes it possible to trace which log-probabilities originated from the question and which from the answer. - -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/reasoning.md -```md -(run-eval-reasoning)= -# Evaluation of Reasoning Models - -Reasoning models require a distinct approach compared to standard language models. Their outputs are typically longer, may contain dedicated reasoning tokens, and are more susceptible to generating loops or repetitive sequences. Evaluating these models effectively requires custom parameter settings and careful handling of generation constraints. - -## Before You Start - -Ensure you have: - -- **Model Endpoint**: An OpenAI-compatible API endpoint for your model (completions or chat). See {ref}`deployment-testing-compatibility` for snippets you can use to test your endpoint. -- **API Access**: Valid API key if your endpoint requires authentication -- **Installed Packages**: NeMo Evaluator or access to evaluation containers - - -## Recommended Settings - -### Generation Settings - -Below are recommended generation settings for some popular reasoning-optimized models. These configurations should be included in the **model card**: - -| Model | Temperature | Top-p | Top-k | -|---------------------|-------------|--------|--------| -| **NVIDIA Nemotron** | 0.6 | 0.95 | — | -| **DeepSeek R1** | 0.6 | 0.95 | — | -| **Qwen 230B** | 0.6 | 0.95 | 20 | -| **Phi-4 Reasoning** | 0.8 | 0.95 | 50 | - - -### Token Configuration - -- `max_new_tokens` must be **significantly increased** for reasoning tasks as it includes the length of both reasoning trace and the final answer. 
-- Check the model card to see settings recommended by the model creators. -- It is important to observe if the specified `max_new_tokens` is enough for the model to finish reasoning. - -:::{tip} -You can verify successful reasoning completion in the logs via the {ref}`interceptor-reasoning` Interceptor, for example: - -``` -[I 2025-12-02T16:14:28.257] Reasoning tracking information reasoning_words=1905 original_content_words=85 updated_content_words=85 reasoning_finished=True reasoning_started=True reasoning_tokens=unknown updated_content_tokens=unknown logger=ResponseReasoningInterceptor request_id=ccff76b2-2b85-4eed-a9d0-2363b533ae58 -``` -::: - -## Reasoning Output Formats - -Reasoning models produce outputs that contain both the **reasoning trace** (the model's step-by-step thinking process) and the **final answer**. The reasoning trace typically includes intermediate thoughts, calculations, and logical steps before arriving at the conclusion. - -There are two main ways to structure reasoning output: - -### 1. Wrapped with reasoning tokens - -e.g. - -``` -... -``` - -``` - ... -``` - -or - -``` - ... -``` - -Most of the benchmarks expect only the final answer to be present in model's response. -If your model endpoint replies with reasoning trace present in the main content, it needs to be removed from the assistant messages. -You can do it using the {ref}`interceptor-reasoning` Interceptor. -The interceptor will remove reasoning trace from the content and (optionally) track statistics for reasoning traces. - -:::{note} -The `ResponseReasoningInterceptor` is by default configured for the `...` and ` ...` format. If your model uses these special tokens, you do not need to modify anything in your configuration. -::: - -### 2. Returned as `reasoning_content` field in messages output - -If your model is deployed with e.g. 
vLLM, sglang or NIM, the reasoning part of the model's output is likely returned in the separate `reasoning_content` field in messages output (see [vLLM documentation](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html) and [sglang documentation](https://sgl-project.github.io/advanced_features/separate_reasoning.html)). - -In the messages returned by the endpoint, there are: - -- `reasoning_content`: The reasoning part of the output. -- `content`: The content of the final answer. - -Unlike the first method, this setup does not require any extra response parsing. -However, in some benchmarks, errors may appear if the reasoning has not finished and the benchmark does not support empty answers in `content`. - -#### Enabling reasoning parser in vLLM - -To enable the `reasoning_content` field in vLLM, you need to pass the `--reasoning-parser` argument to the vLLM server. -In NeMo Evaluator Launcher, you can do this via `deployment.extra_args`: - -```yaml -deployment: - hf_model_handle: Qwen/Qwen3-Next-80B-A3B-Thinking - extra_args: "--reasoning-parser deepseek_r1" -``` - -Available reasoning parsers depend on your vLLM version. Common options include `deepseek_r1` for models using the `<think>...</think>` format. -See the [vLLM reasoning outputs documentation](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html) for details. - ---- - -## Control the Reasoning Effort - -Some models allow turning reasoning on/off or setting its level of effort. There are usually 2 ways of doing it: - -- **Special instruction in the system prompt** -- **Extra parameters passed to the chat_template** - -:::{tip} -Check the model card and documentation of the deployment of your choice to see how you can control the reasoning effort for your model. -If there are several options available, it is recommended to use the dedicated chat template parameters over the system prompt. 
-::: - -### Control reasoning with the system prompt - -In this example we will use the [NVIDIA-Nemotron-Nano-9B-v2](https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard) model. -This model allows you to control the reasoning effort by including `/think` or `/no_think` in the system prompt, e.g.: - - -```json -{ - "model": "nvidia/nvidia-nemotron-nano-9b-v2", - "messages": [ - {"role": "system", "content": "You are a helpful assistant. /think"}, - {"role": "user", "content": "What is 2+2?"} - ], - "temperature": 0.6, - "top_p": 0.95, - "max_tokens": 32768 -} -``` - -When launching the evaluation, we can use the {ref}`interceptor-system-messages` Interceptor to add `/think` or `/no_think` to the system prompt. - - -```yaml -config: - params: - temperature: 0.6 - top_p: 0.95 - max_new_tokens: 32768 # for reasoning + final answer -target: - api_endpoint: - adapter_config: - process_reasoning_traces: true # strips reasoning tokens and collects reasoning stats - use_system_prompt: true # turn reasoning on with special system prompt - custom_system_prompt: >- - "/think" -``` - - -### Control reasoning with additional parameters - -In this example we will use the [Granite-3.3-8B-Instruct](https://build.nvidia.com/ibm/granite-3_3-8b-instruct/modelcard) model. -Conversely to NVIDIA-Nemotron-Nano-9B-v2, this model allows you to turn the reasoning on with an additional `thinking` parameter passed to the chat template: - -```json -{ - "model": "ibm/granite-3.3-8b-instruct", - "messages": [ - { - "role": "user", - "content": "What is 2+2?" 
- } - ], - "temperature": 0.2, - "top_p": 0.7, - "max_tokens": 8192, - "seed": 42, - "stream": true, - "chat_template_kwargs": { - "thinking": true - } -} -``` - -When running the evaluation, use the {ref}`interceptor-payload-modification` Interceptor to add this parameter to benchmarks' requests: - -```yaml -config: - params: - temperature: 0.6 - top_p: 0.95 - max_new_tokens: 32768 # for reasoning + final answer -target: - api_endpoint: - adapter_config: - process_reasoning_traces: true - params_to_add: - chat_template_kwargs: - thinking: true -``` - - -## Benchmarks for Reasoning - -Reasoning models excel at tasks that require multi-step thinking, logical deduction, and complex problem-solving. The following benchmark categories are particularly well-suited for evaluating reasoning capabilities: - - -- **CoT tasks**: e.g., AIME, Math, GPQA-diamond -- **Coding**: e.g., scicodebench, livecodebench - - -:::{tip} -When evaluating your model on a task that does not require step-by-step thinking, consider turning the reasoning off or lowering the thinking budget. -::: - - -## Full Working Example - -### Run the evaluation - -An example config is available in `packages/nemo-evaluator-launcher/examples/local_reasoning.yaml`: - -```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_reasoning.yaml -:language: yaml -:start-after: "[docs-start-snippet]" -``` - -To launch the evaluation, run: - -```bash -export NGC_API_KEY=nvapi-... -nemo-evaluator-launcher run \ - --config packages/nemo-evaluator-launcher/examples/local_reasoning.yaml -``` - -### Analyze the artifacts - -NeMo Evaluator produces several artifacts for analysis after evaluation completion. -The primary output file is `results.yaml`, which stores the metrics produced by the benchmark (see {ref}`evaluation-output` for more details). - -The `eval_factory_metrics.json` file provides valuable insights into your model's behavior. 
-When the reasoning interceptor is enabled, this file contains a `reasoning` key that stores statistics about reasoning traces in your model's responses: - -```json -"reasoning": { - "description": "Reasoning statistics saved during processing", - "total_responses": 3672, - "responses_with_reasoning": 2860, - "reasoning_finished_count": 2860, - "reasoning_finished_ratio": 1.0, - "reasoning_started_count": 2860, - "reasoning_unfinished_count": 0, - "avg_reasoning_words": 153.21, - "avg_original_content_words": 192.17, - "avg_updated_content_words": 38.52, - "max_reasoning_words": 806, - "max_original_content_words": 863, - "max_updated_content_words": 863, - "max_reasoning_tokens": null, - "avg_reasoning_tokens": null, - "max_updated_content_tokens": null, - "avg_updated_content_tokens": null, - "total_reasoning_words": 561696, - "total_original_content_words": 705555, - "total_updated_content_words": 140999, - "total_reasoning_tokens": 0, - "total_updated_content_tokens": 0 - }, -``` - -In the example above, the model used reasoning for 2860 out of 3672 responses (approximately 78%). - -The matching values for `reasoning_started_count` and `reasoning_finished_count` (and `reasoning_unfinished_count` being 0) indicate that the `max_new_tokens` parameter was set sufficiently high, allowing the model to complete all reasoning traces without truncation. - -These statistics also enable cost analysis for reasoning operations. -While the endpoint in this example does not return reasoning token usage statistics (the `*_tokens` fields are null or zero), you can still analyze computational cost using the word count metrics from the responses. - -For more information on available artifacts, see {ref}`evaluation-output`. 
- -``` - -File: /Users/mromeijn/src/Evaluator/docs/evaluation/parameters.md -```md -(eval-parameters)= - -# Evaluation Configuration Parameters - -Comprehensive reference for configuring evaluation tasks in {{ product_name_short }}, covering universal parameters, framework-specific settings, and optimization patterns. - -:::{admonition} Quick Navigation -:class: info - -**Looking for available benchmarks?** -- {ref}`eval-benchmarks` - Browse available benchmarks by category - -**Need help getting started?** -- {ref}`evaluation-overview` - Overview of evaluation workflows -- {ref}`eval-run` - Step-by-step evaluation guides -::: - -## Overview - -All evaluation tasks in {{ product_name_short }} use the {ref}`ConfigParams ` class for configuration. This provides a consistent interface across different evaluation harnesses while allowing framework-specific customization through the `extra` parameter. Default configuration (including which parameters a task uses) is defined in the **Framework Definition File (FDF)** for each framework; see {ref}`framework-definition-file` for details. - - -```python -from nemo_evaluator.api.api_dataclasses import ConfigParams - -# Basic configuration -params = ConfigParams( - temperature=0, - top_p=1.0, - max_new_tokens=256, - limit_samples=100 -) - -# With framework-specific parameters (extra) -params = ConfigParams( - temperature=0, - parallelism=8, - extra={ - "num_fewshot": 5, - "tokenizer": "/path/to/tokenizer", - "custom_prompt": "Answer the question:" - } -) -``` - -:::{admonition} How to see possible parameters for a given task -:class: important - -**Python API (core)** — Get default params and which params a task uses. 
Use `framework_name.task_name` to avoid ambiguity when the same task name exists in multiple harnesses: - -```python -from nemo_evaluator.core.input import get_available_evaluations - -# Returns (framework_evals_mapping, framework_defaults, all_eval_name_mapping) -framework_evals, _, _ = get_available_evaluations() - -# Use framework_name.task_name (e.g. simple_evals.mmlu_pro) for a single task -framework_name, task_name = "simple_evals", "mmlu_pro" -eval_obj = framework_evals[framework_name][task_name] - -# Default params for this task (ConfigParams / dict-like) -print(eval_obj.config.params) - -# Command template shows which {{ config.params.* }} the task uses -print(eval_obj.command) -``` - -**CLI (core)** — List tasks, then show merged config (including params) for a task: - -```bash -# List available tasks -nemo-evaluator ls - -# Show full rendered config (including config.params) for a task without running -# Use framework_name.task_name (e.g. simple_evals.mmlu_pro) to avoid ambiguity -nemo-evaluator run_eval --eval_type simple_evals.mmlu_pro --model_id x --model_url https://example.com/v1/chat/completions --model_type chat --output_dir ./out --dry_run -``` - -The `--dry_run` output prints the merged configuration (YAML) and the rendered command, so you can see which parameters apply to that task. - -**Launcher** — If you use the launcher, `nemo-evaluator-launcher ls task ` (or `harness.task_name`) prints task details including **Defaults** with `config.params` and `config.params.extra`. List all tasks with `nemo-evaluator-launcher ls tasks`. -::: - -## Universal Parameters - -These parameters are standardized across all frameworks and share the same names and semantics. That does **not** mean every framework supports every parameter: each task’s command template only uses a subset. 
If you pass a parameter that the task does not use, you will see a warning like: *"Configuration contains parameter(s) that are not used in the command template"* (see `validate_params_in_command` in `nemo_evaluator.core.utils`). - -```{list-table} -:header-rows: 1 -:widths: 12 14 10 28 22 14 - -* - Category - - Parameter - - Type - - Description - - Example Values - - Notes -* - Sampling - - `temperature` - - `float` - - Sampling randomness - - `0` (deterministic), `0.7` (creative) - - Use `0` for reproducible results -* - Sampling - - `top_p` - - `float` - - Nucleus sampling threshold - - `1.0` (disabled), `0.9` (selective) - - Controls diversity of generated text -* - Sampling - - `max_new_tokens` - - `int` - - Maximum response length - - `256`, `512`, `1024` - - Limits generation length -* - Evaluation control - - `limit_samples` - - `int/float` - - Evaluation subset size - - `100` (count), `0.1` (10% of dataset) - - Use for quick testing or resource limits -* - Evaluation control - - `task` - - `str` - - Task-specific identifier - - `"custom_task"` - - Used by some harnesses for task routing -* - Performance - - `parallelism` - - `int` - - Concurrent request threads - - `1`, `8`, `16` - - Balance against server capacity -* - Performance - - `max_retries` - - `int` - - Retry attempts for failed requests - - `3`, `5`, `10` - - Increases robustness for network issues -* - Performance - - `request_timeout` - - `int` - - Request timeout (seconds) - - `60`, `120`, `300` - - Adjust for model response time -``` - -## Framework-Specific Parameters - -Framework-specific parameters are passed through the `extra` dictionary within `ConfigParams`. 
- -::::{dropdown} LM-Evaluation-Harness Parameters -:icon: code-square - -```{list-table} -:header-rows: 1 -:widths: 15 10 30 25 20 - -* - Parameter - - Type - - Description - - Example Values - - Use Cases -* - `num_fewshot` - - `int` - - Few-shot examples count - - `0`, `5`, `25` - - Academic benchmarks -* - `tokenizer` - - `str` - - Tokenizer path - - `"/path/to/tokenizer"` - - Log-probability tasks -* - `tokenizer_backend` - - `str` - - Tokenizer implementation - - `"huggingface"`, `"sentencepiece"` - - Custom tokenizer setups -* - `trust_remote_code` - - `bool` - - Allow remote code execution - - `True`, `False` - - For custom tokenizers -* - `add_bos_token` - - `bool` - - Add beginning-of-sequence token - - `True`, `False` - - Model-specific formatting -* - `add_eos_token` - - `bool` - - Add end-of-sequence token - - `True`, `False` - - Model-specific formatting -* - `fewshot_delimiter` - - `str` - - Separator between examples - - `"\\n\\n"`, `"\\n---\\n"` - - Custom prompt formatting -* - `fewshot_seed` - - `int` - - Reproducible example selection - - `42`, `1337` - - Ensures consistent few-shot examples -* - `description` - - `str` - - Custom prompt prefix - - `"Answer the question:"` - - Task-specific instructions -* - `bootstrap_iters` - - `int` - - Statistical bootstrap iterations - - `1000`, `10000` - - For confidence intervals -``` - -:::: - -::::{dropdown} Simple-Evals Parameters -:icon: code-square - -```{list-table} -:header-rows: 1 -:widths: 15 10 30 25 20 - -* - Parameter - - Type - - Description - - Example Values - - Use Cases -* - `pass_at_k` - - `list[int]` - - Code evaluation metrics - - `[1, 5, 10]` - - Code generation tasks -* - `timeout` - - `int` - - Code execution timeout - - `5`, `10`, `30` - - Code generation tasks -* - `max_workers` - - `int` - - Parallel execution workers - - `4`, `8`, `16` - - Code execution parallelism -* - `languages` - - `list[str]` - - Target programming languages - - `["python", "java", "cpp"]` - - 
Multi-language evaluation -``` - -:::: - -::::{dropdown} BigCode-Evaluation-Harness Parameters -:icon: code-square - -```{list-table} -:header-rows: 1 -:widths: 15 10 30 25 20 - -* - Parameter - - Type - - Description - - Example Values - - Use Cases -* - `num_workers` - - `int` - - Parallel execution workers - - `4`, `8`, `16` - - Code execution parallelism -* - `eval_metric` - - `str` - - Evaluation metric - - `"pass_at_k"`, `"bleu"` - - Different scoring methods -* - `languages` - - `list[str]` - - Programming languages - - `["python", "javascript"]` - - Language-specific evaluation -``` - -:::: - -::::{dropdown} Safety and Specialized Harnesses -:icon: code-square - -```{list-table} -:header-rows: 1 -:widths: 15 10 30 25 20 - -* - Parameter - - Type - - Description - - Example Values - - Use Cases -* - `probes` - - `str` - - Garak security probes - - `"ansiescape.AnsiEscaped"` - - Security evaluation -* - `detectors` - - `str` - - Garak security detectors - - `"base.TriggerListDetector"` - - Security evaluation -* - `generations` - - `int` - - Number of generations per prompt - - `1`, `5`, `10` - - Safety evaluation -``` - -:::: - -## Parameter Selection Guidelines - -- Configure `parallelism` and `request_timeout` based on server capacity. -- Use `limit_samples` for subset evaluation (e.g. for debugging or quick validation). 
- -## Common Configuration Errors - -### Tokenizer Issues - -:::{admonition} Problem -:class: error -Missing tokenizer for log-probability tasks - -```python -# Incorrect - missing tokenizer -params = ConfigParams(extra={}) -``` -::: - -:::{admonition} Solution -:class: tip -Always specify tokenizer for log-probability tasks - -```python -# Correct -params = ConfigParams( - extra={ - "tokenizer_backend": "huggingface", - "tokenizer": "/path/to/nemo_tokenizer" - } -) -``` -::: - -### Performance Issues - -:::{admonition} Problem -:class: error -Excessive parallelism overwhelming server - -```python -# Incorrect - too many concurrent requests -params = ConfigParams(parallelism=100) -``` -::: - -:::{admonition} Solution -:class: tip -Start conservative and scale up - -```python -# Correct - reasonable concurrency -params = ConfigParams(parallelism=8, max_retries=3) -``` -::: - -### Parameter Conflicts - -:::{admonition} Problem -:class: error -Mixing generation and log-probability parameters - -```python -# Incorrect - generation params unused for log-probability -params = ConfigParams( - temperature=0.7, # Ignored for log-probability tasks - extra={"tokenizer": "/path"} -) -``` -::: - -:::{admonition} Solution -:class: tip -Use appropriate parameters for task type - -```python -# Correct - only relevant parameters -params = ConfigParams( - limit_samples=100, # Relevant for all tasks - extra={"tokenizer": "/path"} # Required for log-probability -) -``` -::: - -## Next Steps - -- **Basic Usage**: See {ref}`text-gen` for getting started -- **Custom Tasks**: Learn {ref}`eval-custom-tasks` for specialized evaluations -- **Troubleshooting**: Refer to {ref}`troubleshooting-index` for common issues -- **Benchmarks**: Browse {ref}`eval-benchmarks` for task-specific recommendations - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_basic.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# How to use: -# -# 1. copy this file locally or clone the repository -# 2. (optional) comment out limit_samples in the config file to run on the full dataset -# 3. run `nemo-evaluator-launcher run --config path/to/local_basic.yaml` -# -# ⚠️ WARNING: -# Always run full evaluations (without limit_samples) for actual benchmark results. -# Using a subset of samples is solely for testing configuration and setup. -# Results from such test runs should NEVER be used to compare models or -# report benchmark performance. 
- -# [docs-start-snippet] -defaults: - - execution: local - - deployment: none - - _self_ - -execution: - output_dir: nel-results - -target: - api_endpoint: - # see https://build.nvidia.com/meta/llama-3_1-8b-instruct for endpoint details - model_id: meta/llama-3.2-3b-instruct - url: https://integrate.api.nvidia.com/v1/chat/completions - api_key_name: NGC_API_KEY # API Key with access to build.nvidia.com - -# specify the benchmarks to evaluate -evaluation: - # global config settings that apply to all tasks, unless overridden by task-specific config - nemo_evaluator_config: - config: - params: - request_timeout: 3600 # timeout for API request in seconds - parallelism: 1 # 1 parallel request to avoid overloading the server - limit_samples: 10 # TEST ONLY: Limits all benchmarks to 10 samples for quick testing - tasks: - - name: lm-evaluation-harness.ifeval - - name: simple_evals.gpqa_diamond - env_vars: - HF_TOKEN: host:HF_TOKEN_FOR_GPQA_DIAMOND # Click request access for GPQA-Diamond: https://huggingface.co/datasets/Idavidrein/gpqa - - name: bigcode-evaluation-harness.mbpp-chat - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_reasoning.yaml -```yaml -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# How to use: -# -# 1. copy this file locally or clone the repository -# 2. 
(optional) uncomment limit_samples in the config file to run with 10 samples for quick testing -# 3. run `nemo-evaluator-launcher run --config path/to/local_reasoning.yaml` - -# ⚠️ WARNING: -# Always run full evaluations (without limit_samples) for actual benchmark results. -# Using a subset of samples is solely for testing configuration and setup. -# Results from such test runs should NEVER be used to compare models or -# report benchmark performance. - -# [docs-start-snippet] -defaults: - - execution: local - - deployment: none - - _self_ - -execution: - output_dir: nel-results - -target: - api_endpoint: - # see https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2 for endpoint details - model_id: nvidia/nvidia-nemotron-nano-9b-v2 - url: https://integrate.api.nvidia.com/v1/chat/completions - api_key_name: NGC_API_KEY # API Key with access to build.nvidia.com - -evaluation: - # global config settings that apply to all tasks, unless overridden by task-specific config - nemo_evaluator_config: - config: - params: - request_timeout: 3600 # timeout for API request in seconds - parallelism: 1 # 1 parallel request to avoid overloading the server - # limit_samples: 10 # uncomment to limit number of samples for quick testing - - tasks: - # run complex tasks with reasoning on - - name: simple_evals.mmlu_pro - nemo_evaluator_config: - config: - params: - temperature: 0.6 - top_p: 0.95 - max_new_tokens: 32768 # for reasoning + final answer - target: - api_endpoint: - adapter_config: - process_reasoning_traces: true # strips reasoning tokens and collects reasoning stats - use_system_prompt: true # turn reasoning on with special system prompt - custom_system_prompt: >- - "/think" - - # run simpler tasks with reasoning off - - name: lm-evaluation-harness.ifeval - nemo_evaluator_config: - config: - params: - max_new_tokens: 1024 # we can use less tokens with reasoning off - target: - api_endpoint: - adapter_config: - use_system_prompt: true # turn reasoning off with special 
system prompt - custom_system_prompt: >- - "/no_think" - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/base/default.yaml -```yaml -# Base model evaluation configuration (completions endpoint) -# Uses log-probabilities for multiple-choice tasks -# See: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/run-evals/logprobs.html -evaluation: - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - parallelism: 64 - extra: - # Tokenizer required for log-probability tasks - tokenizer: ??? # HuggingFace model handle or path to tokenizer (must match evaluated model) - tokenizer_backend: huggingface # or "tiktoken" - env_vars: - HF_TOKEN: HF_TOKEN # Required to access gated tokenizers - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/base/standard.yaml -```yaml -# Standard LLM Benchmarks for base models (completions endpoint, log-probability based) -# These tasks use log-probabilities to assess model confidence on answer choices -evaluation: - tasks: - - name: lm-evaluation-harness.mmlu # Log-prob based multiple choice - - name: lm-evaluation-harness.gpqa # Log-prob based (completions version) - - name: lm-evaluation-harness.arc_challenge # Log-prob based - - name: lm-evaluation-harness.hellaswag # Log-prob based - - name: lm-evaluation-harness.commonsense_qa # Log-prob based - -``` - -File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/chat/default.yaml -```yaml -# Chat model evaluation configuration (chat endpoint) -evaluation: - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - parallelism: 64 - temperature: 0.0 # Deterministic for reproducibility - max_new_tokens: 2048 - -``` - diff --git a/skills/nemotron-customize/context/mbridge-parallelism-performance.txt 
b/skills/nemotron-customize/context/mbridge-parallelism-performance.txt deleted file mode 100644 index cc00ec1a0..000000000 --- a/skills/nemotron-customize/context/mbridge-parallelism-performance.txt +++ /dev/null @@ -1,6791 +0,0 @@ - -/Users/mromeijn/src/Megatron-Bridge -├── docs -│ ├── training -│ │ ├── images -│ │ ├── activation-recomputation.md * -│ │ ├── communication-overlap.md * -│ │ ├── hybrid-context-parallel.md * -│ │ ├── megatron-fsdp.md * -│ │ ├── mixed-precision.md * -│ │ └── packed-sequences.md * -│ ├── images -│ ├── modelopt -│ ├── models -│ │ ├── llm -│ │ └── vlm -│ ├── releases -│ ├── parallelisms.md * -│ └── performance-guide.md * -├── skills -│ ├── perf-techniques -│ │ ├── cuda-graphs -│ │ │ └── SKILL.md * -│ │ ├── parallelism-strategies -│ │ │ └── SKILL.md * -│ │ ├── sequence-packing -│ │ │ └── SKILL.md * -│ │ ├── tp-dp-comm-overlap -│ │ │ └── SKILL.md * -│ │ ├── expert-parallel-overlap -│ │ ├── hybrid-context-parallel -│ │ ├── megatron-fsdp -│ │ ├── moe-comm-overlap -│ │ └── packed-sequences-long-context -│ ├── adding-model-support -│ ├── code-style -│ ├── developer-guide -│ ├── mlm-bridge-training -│ ├── multi-node-slurm -│ ├── parity-testing -│ └── resiliency -├── src -│ └── megatron -│ └── bridge -│ ├── training -│ │ ├── comm_overlap.py * + -│ │ ├── config.py * + -│ │ ├── initialize.py * + -│ │ ├── ... -│ ├── data -│ │ └── ... -│ ├── diffusion -│ │ └── ... -│ ├── inference -│ │ └── ... -│ ├── models -│ │ └── ... -│ ├── peft -│ ├── recipes -│ │ └── ... -│ └── utils -├── .github -│ ├── ISSUE_TEMPLATE -│ ├── actions -│ │ └── test-template -│ └── workflows -│ └── config -├── .specstory -├── 3rdparty -│ └── Megatron-LM -│ ├── .github -│ │ ├── ISSUE_TEMPLATE -│ │ ├── actions -│ │ │ └── ... -│ │ ├── scripts -│ │ └── workflows -│ │ └── ... -│ ├── .gitlab -│ │ ├── scripts -│ │ └── stages -│ ├── docker -│ │ ├── common -│ │ └── patches -│ ├── docs -│ │ ├── advanced -│ │ ├── api-guide -│ │ │ └── ... -│ │ ├── developer -│ │ ├── discussions -│ │ │ └── ... 
-│ │ ├── get-started -│ │ ├── images -│ │ │ └── ... -│ │ ├── models -│ │ └── user-guide -│ │ └── ... -│ ├── examples -│ │ ├── academic_paper_scripts -│ │ │ └── ... -│ │ ├── bert -│ │ ├── export -│ │ │ └── ... -│ │ ├── gpt3 -│ │ ├── inference -│ │ │ └── ... -│ │ ├── llama -│ │ ├── mamba -│ │ ├── mimo -│ │ │ └── ... -│ │ ├── mixtral -│ │ ├── multimodal -│ │ │ └── ... -│ │ ├── post_training -│ │ │ └── ... -│ │ ├── rl -│ │ │ └── ... -│ │ └── t5 -│ ├── images -│ ├── megatron -│ │ ├── core -│ │ │ └── ... -│ │ ├── inference -│ │ ├── legacy -│ │ │ └── ... -│ │ ├── post_training -│ │ ├── rl -│ │ │ └── ... -│ │ └── training -│ │ └── ... -│ ├── scripts -│ ├── tasks -│ ├── tests -│ │ ├── functional_tests -│ │ │ └── ... -│ │ ├── test_utils -│ │ │ └── ... -│ │ └── unit_tests -│ │ └── ... -│ └── tools -│ ├── bert_embedding -│ └── checkpoint -├── docker -│ ├── common -│ └── patches -├── examples -│ ├── conversion -│ │ ├── adapter -│ │ └── compare_hf_and_megatron -│ ├── decentralized_pg -│ ├── diffusion -│ │ └── recipes -│ │ ├── flux -│ │ │ └── ... -│ │ └── wan -│ │ └── ... -│ ├── distillation -│ │ └── llama -│ │ └── conf -│ ├── evaluation -│ │ └── utils -│ ├── inference -│ │ └── vlm -│ ├── long_context -│ ├── models -│ │ ├── audio_lm -│ │ │ ├── qwen2_audio -│ │ │ └── qwen3_asr -│ │ ├── bailing -│ │ ├── gpt_oss -│ │ ├── minimax_m2 -│ │ ├── nemotron_3 -│ │ │ ├── nano -│ │ │ └── super -│ │ ├── qwen3_next -│ │ │ └── conf -│ │ ├── sarvam -│ │ └── vlm -│ │ ├── gemma3_vl -│ │ ├── glm_45v -│ │ ├── kimi_k25_vl -│ │ ├── ministral3 -│ │ ├── nemotron_vl -│ │ │ └── ... -│ │ ├── qwen25_omni -│ │ ├── qwen35_vl -│ │ ├── qwen3_vl -│ │ └── qwen_vl -│ │ └── ... 
-│ ├── peft -│ ├── quantization -│ │ └── conf -│ ├── resiliency -│ │ ├── fault_tolerance -│ │ └── straggler_detection -│ └── rl -├── scripts -│ ├── performance -│ │ ├── configs -│ │ │ ├── deepseek -│ │ │ ├── gpt_oss -│ │ │ ├── kimi -│ │ │ ├── llama -│ │ │ ├── nemotronh -│ │ │ ├── qwen -│ │ │ └── qwen_vl -│ │ └── utils -│ └── training -├── tests -│ ├── functional_tests -│ │ ├── data -│ │ │ ├── energon -│ │ │ └── hf_processors -│ │ ├── diffusion -│ │ │ ├── flux -│ │ │ └── wan -│ │ ├── inference -│ │ ├── launch_scripts -│ │ │ ├── active -│ │ │ └── flaky -│ │ ├── models -│ │ │ ├── qwen3_asr -│ │ │ └── qwen_audio -│ │ └── test_groups -│ │ ├── ckpts -│ │ │ └── ... -│ │ ├── converter -│ │ ├── data -│ │ │ └── ... -│ │ ├── diffusion -│ │ │ └── ... -│ │ ├── models -│ │ │ └── ... -│ │ ├── quantization -│ │ │ └── ... -│ │ ├── recipes -│ │ ├── training -│ │ └── utils -│ └── unit_tests -│ ├── data -│ │ ├── builders -│ │ ├── datasets -│ │ ├── energon -│ │ ├── mimo -│ │ └── vlm_datasets -│ ├── diffusion -│ │ ├── data -│ │ │ └── ... -│ │ ├── model -│ │ │ └── ... -│ │ └── recipes -│ │ └── ... -│ ├── inference -│ │ └── vlm -│ ├── models -│ │ ├── common -│ │ ├── decorators -│ │ ├── deepseek -│ │ ├── gemma -│ │ ├── gemma_vl -│ │ ├── glm -│ │ ├── glm_vl -│ │ ├── gpt -│ │ ├── gpt_oss -│ │ ├── hf_pretrained -│ │ ├── kimi -│ │ ├── kimi_vl -│ │ ├── llama -│ │ ├── llama_nemotron -│ │ ├── mamba -│ │ ├── mimo -│ │ ├── minimax_m2 -│ │ ├── ministral3 -│ │ ├── mistral -│ │ ├── nemotron -│ │ ├── nemotron_vl -│ │ ├── nemotronh -│ │ ├── olmoe -│ │ ├── qwen -│ │ ├── qwen3_asr -│ │ │ └── ... -│ │ ├── qwen_audio -│ │ ├── qwen_omni -│ │ │ └── ... -│ │ ├── qwen_vl -│ │ │ └── ... -│ │ └── sarvam -│ ├── peft -│ ├── recipes -│ │ ├── gemma -│ │ ├── gpt -│ │ ├── kimi -│ │ ├── nemotronh -│ │ ├── qwen -│ │ ├── qwen_vl -│ │ │ └── ... 
-│ │ └── utils -│ ├── scripts -│ │ └── performance -│ ├── training -│ │ ├── mimo -│ │ ├── mlm_compat -│ │ ├── post_training -│ │ └── utils -│ └── utils -└── tutorials - ├── data - │ └── dclm - ├── recipes - │ └── llama - │ └── conf - └── training - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; depth cap 3; selected files shown. - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/process_groups_config.py -Imports: - - from dataclasses import dataclass, field, fields - - from functools import partial - - from typing import List, Optional - - import torch - - from megatron.core import parallel_state - - from megatron.core.utils import get_model_config - - import logging - - from megatron.core.utils import log_single_rank ---- -Classes: - - ProcessGroupHelperMeta - Methods: - - L17: def __setattr__(cls, name, value): - - ProcessGroupCollection - Methods: - - L136: def __init__(self, **kwargs): - - L143: def __repr__(self): - - L161: def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None): - - L253: def setup_process_groups_for_optimizer( - pg_collection: Optional['ProcessGroupCollection'], - model_chunks: List, - use_gloo_process_groups: bool = True, - ): - - L444: def setup_process_groups_for_ddp( - pg_collection: Optional['ProcessGroupCollection'], config, ddp_config - ): - Properties: - - tp - - pp - - mp - - embd - - pos_embd - - cp - - tp_cp - - hcp - - ep - - expt_tp - - tp_ep - - tp_ep_pp - - tp_dp_cp - - dp - - dp_cp - - expt_dp - - intra_dp_cp - - intra_expt_dp - - inter_dist_opt - - intra_dist_opt ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/training/resilience_config.py -Imports: - - from dataclasses import dataclass - - from typing import Literal ---- -Classes: - - RerunStateMachineConfig - Properties: - - error_injection_rate - - error_injection_type - - rerun_mode - - check_for_nan_in_loss - - check_for_spiky_loss - - 
StragglerDetectionConfig - Properties: - - log_straggler - - straggler_ctrlr_port - - straggler_minmax_count - - disable_straggler_on_startup ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/training/common_config.py -Imports: - - from dataclasses import dataclass, field - - from typing import Literal - - import os ---- -Classes: - - RNGConfig - Properties: - - seed - - te_rng_tracker - - inference_rng_tracker - - data_parallel_random_init - - ProfilingConfig - Properties: - - use_nsys_profiler - - profile_step_start - - profile_step_end - - use_pytorch_profiler - - pytorch_profiler_collect_shapes - - pytorch_profiler_collect_callstack - - pytorch_profiler_collect_chakra - - profile_ranks - - record_memory_history - - memory_snapshot_path - - record_shapes - - nvtx_ranges - - DistributedInitConfig - Properties: - - distributed_backend - - distributed_timeout_minutes - - align_grad_reduce - - local_rank - - lazy_mpu_init - - use_megatron_fsdp - - use_torch_fsdp2 - - nccl_communicator_config_path - - use_tp_pp_dp_mapping - - enable_gloo_process_groups - - use_sharp - - sharp_enabled_group - - high_priority_stream_groups - - distributed_timeout_seconds_after_init - - disable_jit_fuser ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/distributed/distributed_data_parallel_config.py -Imports: - - from dataclasses import dataclass - - from typing import Optional - - import os ---- -Classes: - - DistributedDataParallelConfig - Methods: - - L168: def __post_init__(self): - Properties: - - grad_reduce_in_fp32 - - overlap_grad_reduce - - overlap_param_gather - - align_param_gather - - use_distributed_optimizer - - num_distributed_optimizer_instances - - check_for_nan_in_grad - - check_for_large_grads - - bucket_size - - pad_buckets_for_high_nccl_busbw - - reduce_scatter_with_fp32_accumulation - - average_in_collective - - fp8_param_gather - - reuse_grad_buf_for_mxfp8_param_ag - - use_megatron_fsdp - - 
use_custom_fsdp - - data_parallel_sharding_strategy - - gradient_reduce_div_fusion - - suggested_communication_unit_size - - preserve_fp32_weights - - keep_fp8_transpose_cache - - nccl_ub - - fsdp_double_buffer - - fsdp_db_use_persist_buf_on_alloc_fail - - fsdp_all_gather_in_start_param_sync - - outer_dp_sharding_strategy - - disable_symmetric_registration - - fsdp_manual_registration - - delay_wgrad_compute ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/models/transformer_config.py -Imports: - - import copy - - from dataclasses import dataclass, fields, is_dataclass - - from megatron.core.transformer.heterogeneous.heterogeneous_config import ( - HeterogeneousTransformerConfig as MCoreHeterogeneousTransformerConfig, -) - - from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig - - from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig - - from megatron.bridge.utils.activation_map import str_to_callable - - from megatron.bridge.utils.activation_map import str_to_dtype ---- -Classes: - - TransformerConfig - Methods: - - L97: def __post_init__(self) -> None: - - L106: def finalize(self) -> None: - - L127: def __deepcopy__(self, memo): - - L146: def asdict(self) -> dict: - Properties: - - _NO_COPY_KEYS - - MLATransformerConfig - Methods: - - L172: def __post_init__(self) -> None: - - L181: def finalize(self) -> None: - - HeterogeneousTransformerConfig - Methods: - - L227: def __post_init__(self) -> None: - - L236: def finalize(self) -> None: - - L248: def get_config_for_layer(self, layer_number: int) -> MCoreTransformerConfig: - -Functions: - - L31: def _safe_asdict(obj, skip_keys: set[str]) -> dict: - - L51: def _resolve_string_fields(config: MCoreTransformerConfig) -> None: ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/optimizer/optimizer_config.py -Imports: - - import fnmatch - - from dataclasses import 
dataclass, field - - from typing import Callable, Optional, Tuple, Union - - import torch - - from ..utils import is_te_min_version - - import warnings - - import inspect - - from transformer_engine.pytorch.optimizers import FusedAdam as Adam ---- -Classes: - - ParamPredicate - Methods: - - L32: def __call__(self, param: torch.nn.Parameter) -> bool: - Properties: - - name - - fn - - ParamWithNamePredicate - Methods: - - L60: def __call__(self, param: torch.nn.Parameter, name: str) -> bool: - Properties: - - name - - fn - - ParamKey - Methods: - - L89: def matches(self, param: torch.nn.Parameter, param_name: str) -> bool: - Properties: - - name - - attr - - predicate - - with_name_predicate - - OptimizerConfig - Methods: - - L346: def __post_init__(self): - Properties: - - lr - - min_lr - - decoupled_lr - - decoupled_min_lr - - weight_decay - - apply_wd_to_qk_layernorm - - fp8_recipe - - fp16 - - bf16 - - reuse_grad_buf_for_mxfp8_param_ag - - params_dtype - - use_precision_aware_optimizer - - store_param_remainders - - main_grads_dtype - - main_params_dtype - - exp_avg_dtype - - exp_avg_sq_dtype - - optimizer - - loss_scale - - initial_loss_scale - - min_loss_scale - - loss_scale_window - - hysteresis - - adam_beta1 - - adam_beta2 - - adam_eps - - decoupled_weight_decay - - sgd_momentum - - muon_momentum - - muon_split_qkv - - muon_use_nesterov - - muon_scale_mode - - muon_fp32_matmul_prec - - muon_num_ns_steps - - muon_tp_mode - - muon_extra_scale_factor - - use_distributed_optimizer - - overlap_param_gather - - overlap_param_gather_with_optimizer_step - - optimizer_cpu_offload - - optimizer_offload_fraction - - use_torch_optimizer_for_cpu_offload - - overlap_cpu_optimizer_d2h_h2d - - pin_cpu_grads - - pin_cpu_params - - clip_grad - - log_num_zeros_in_grad - - barrier_with_L1_time - - timers - - config_logger_dir - - AdamOptimizerConfig - Properties: - - optimizer - - adam_beta1 - - adam_beta2 - - adam_eps - - SGDOptimizerConfig - Properties: - - optimizer - - 
sgd_momentum ---- - - -File: /Users/mromeijn/src/Nemotron/src/nemotron/kit/megatron_stub.py -Imports: - - from dataclasses import dataclass, field - - from pathlib import Path ---- -Classes: - - DataConfig - Properties: - - data_path - - mock - - seq_length - - micro_batch_size - - global_batch_size - - ModelConfig - Properties: - - name - - num_layers - - hidden_size - - num_attention_heads - - ffn_hidden_size - - vocab_size - - OptimizerConfig - Properties: - - lr - - min_lr - - weight_decay - - adam_beta1 - - adam_beta2 - - TrainingConfig - Properties: - - max_steps - - log_interval - - eval_interval - - save_interval - - fp16 - - bf16 - - CheckpointConfig - Properties: - - dir - - save_on_train_end - - resume_from - - ConfigContainer - Properties: - - data - - model - - optimizer - - training - - checkpoint ---- - - - -File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md -```md -# Parallelisms Guide - -Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency. - -## Data Parallelism - -Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps. - -### Distributed Data Parallelism - -Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives. 
- -![Distributed Data Parallelism](images/ddp.gif) -*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.* - -### Distributed Optimizer - -[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs only its own gradient shard, the distributed optimizer performs a reduce-scatter of the parameter gradients instead of an all-reduce. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory requirements of large-scale LLM training. - -### Enable Data Parallelism - -In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group. - -To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig`: - -```python -from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig - -optimizer_config = OptimizerConfig( - optimizer="adam", - lr=3e-4, - weight_decay=0.1, - adam_beta1=0.9, - adam_beta2=0.95, - use_distributed_optimizer=True, - clip_grad=1.0, -) -ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) - -config = ConfigContainer( - ddp=ddp_config, - optimizer=optimizer_config, - # ... other config parameters -) -``` - -For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation.
- -## Model Parallelism - -Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need for per-GPU memory. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance. - -### Tensor Parallelism - -Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads. - -![Tensor Parallelism Overview](images/tp1.png) -*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.* - -![Tensor Parallelism Implementation](images/tp2.png) -*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.* - -#### Enable Tensor Parallelism - -To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. This parameter determines the number of GPUs among which the model's tensors are partitioned. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with tensor parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Enable TP across 2 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Implement Tensor Parallelism - -Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html). 
- -### Pipeline Parallelism - -Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. - -![Pipeline Parallelism](images/pp.gif) -*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.* - -#### Enable Pipeline Parallelism - -To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with pipeline parallelism -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, # Distribute layers across 4 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Interleaved Pipeline Parallel Schedule - -To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`: - -```python -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=2, # 2 model chunks per pipeline stage - # ... other model parameters -) -``` - -For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism). - -#### Implement Pipeline Parallelism - -The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. 
For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html). - -### Expert Parallelism and Mixture of Experts (MoE) - -Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers. - -MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input. - -![Expert Parallelism](images/ep.png) -*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.* - -#### Basic MoE Configuration - -To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider: - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure basic MoE model -model_config = GPTModelProvider( - num_moe_experts=8, # Number of experts in the MoE module - moe_router_topk=2, # Number of experts activated per token - moe_ffn_hidden_size=8192, # Hidden size for expert FFN layers - # ... other model parameters -) -``` - -#### Enable Expert Parallelism - -To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. 
- -```python -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, # Distribute 8 experts across 4 GPUs (2 experts per GPU) - # ... other model parameters -) -``` - -#### Enable Expert Tensor Parallelism - -To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration: - -```python -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - expert_tensor_parallel_size=2, # Apply tensor parallelism within each expert - # ... other model parameters -) -``` - -#### Advanced MoE Features - -Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures. - -##### DeepEP and HybridEP Optimizations - -DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures: - -- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs -- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs - -These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads. - -**Enable DeepEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... other model parameters -) - -# Apply DeepEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") -``` - -**Enable HybridEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... 
other model parameters -) - -# Apply HybridEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep") -``` - -**GPU Architecture Requirements:** - -- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 -- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 - -The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware. - -##### Token Dropping for Load Balancing - -Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -model_config = GPTModelProvider( - num_moe_experts=8, - moe_router_topk=2, - moe_token_dispatcher_type="alltoall", # Required for token dropping - moe_router_load_balancing_type="aux_loss", # Required load balancing type - # ... other model parameters -) - -# Apply token dropping with capacity factor -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, # Capacity multiplier per expert - moe_pad_expert_input_to_capacity=True, # Pad inputs to capacity length -) -``` - -**Configuration Parameters:** - -- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing. -- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes. 
- -**Requirements:** - -- Token dispatcher must be `alltoall` or `alltoall_seq` -- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none` - -**Trade-offs:** - -Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed. - -#### Complete MoE Configuration Example - -Here's a complete example showing how to configure an MoE model with advanced optimizations: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - - # MoE configuration - num_moe_experts=8, # 8 experts total - moe_router_topk=2, # Activate 2 experts per token - moe_ffn_hidden_size=8192, # Expert FFN hidden dimension - moe_token_dispatcher_type="alltoall", # Token dispatcher type - moe_router_load_balancing_type="aux_loss", # Load balancing - - # Expert parallelism - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - expert_tensor_parallel_size=2, # Apply TP within each expert - - # ... other model parameters -) - -# Apply DeepEP optimization (for Ampere/Hopper GPUs) -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") - -# Apply token dropping for load balancing -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, - moe_pad_expert_input_to_capacity=True, -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Expert Parallelism Implementation - -The Megatron Bridge implementation of EP uses functionality from Megatron Core. 
Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details. - -## Activation Partitioning - -In LLM training, a large memory space is needed to store the input activations of the network layers. Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes. - -### Sequence Parallelism - -Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. - -![Sequence Parallelism](images/sp.png) -*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.* - -#### Enable Sequence Parallelism - -To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with sequence parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Required for sequence parallelism - sequence_parallel=True, # Enable sequence parallelism - # ... other model parameters -) -``` - -#### Implement Sequence Parallelism - -The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. 
For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py). - -### Context Parallelism - -Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers. - -CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences. - -#### Enable Context Parallelism - -To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with context parallelism -model_config = GPTModelProvider( - context_parallel_size=2, # Distribute sequence across 2 GPUs - # ... other model parameters -) -``` - -For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs. - -#### Implement Context Parallelism - -Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. 
In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. - -For more detailed technical information and implementation details, visit: -- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html) -- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py) -- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) - -## Combined Parallelism Example - -Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer, OptimizerConfig - -# Configure model with multiple parallelism strategies -model_config = GPTModelProvider( - # Model parallelism - tensor_model_parallel_size=2, # 2-way tensor parallelism - pipeline_model_parallel_size=4, # 4-way pipeline parallelism - virtual_pipeline_model_parallel_size=2, # Interleaved pipeline - - # Activation partitioning - sequence_parallel=True, # Enable sequence parallelism (requires TP > 1) - context_parallel_size=2, # 2-way context parallelism - - # Expert parallelism (for MoE models) - num_moe_experts=8, # 8 experts - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - - # ... other model parameters -) - -# Configure distributed optimizer -optimizer_config = OptimizerConfig( - optimizer="adam", - use_distributed_optimizer=True, # Enable distributed optimizer - # ... 
other optimizer parameters -) - -config = ConfigContainer( - model=model_config, - optimizer=optimizer_config, - # ... other config parameters -) -``` - -## Data Parallel Size Calculation - -The data parallel size is automatically calculated based on the total world size and model parallelism settings: - -``` -data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size) -``` - -For example, with 32 GPUs total and the configuration above: -- `tensor_model_parallel_size = 2` -- `pipeline_model_parallel_size = 4` -- `context_parallel_size = 2` -- `data_parallel_size = 32 / (2 × 4 × 2) = 2` - -## Strategy Selection Guide - -Choosing the right combination depends on model size, hardware topology, -and sequence length. - -### Dense Models by Size - -| Model size | GPUs | Recommended starting point | -|---|---|---| -| < 1B | 1-8 | DP only | -| 1-10B | 8-16 | TP=2-4 + DP | -| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP | -| 70-175B | 64-256 | TP=8 + PP=4-8 + DP | -| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP | - -### MoE Models - -MoE models differ fundamentally from dense models: only a fraction of -parameters are active per token, so TP can often stay at 1 or 2. EP is -the primary scaling dimension. - -| Total / active params | Typical layout | -|---|---| -| < 20B | EP only (TP=1, PP=1) | -| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 | -| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 | -| 500B+ | TP=2 + PP=16 + EP=32-64 | - -### By Hardware Topology - -- **Single node with NVLink**: maximize TP within the node (up to TP=8). -- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes. -- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling. 
- -### By Sequence Length - -| Sequence length | Recommendation | -|---|---| -| < 2K | standard TP + PP + DP | -| 2K-8K | add SP (`sequence_parallel=True`) | -| 8K-32K | add CP=2 | -| 32K+ | add CP=4-8, consider hierarchical CP | - -For operational details on configuring combined parallelism, troubleshooting -layouts, and memory estimation, see the -[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md). - -## Configuration Guidelines - -### Memory Optimization -- Use **distributed optimizer** to reduce optimizer state memory -- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory -- Use **context parallelism** for long sequence training -- Consider **pipeline parallelism** for very large models that don't fit on a single GPU - -### Performance Optimization -- **Tensor parallelism** works best within a single node (high bandwidth) -- **Pipeline parallelism** can work across nodes but requires careful batch size tuning -- **Context parallelism** is essential for long context scenarios -- **Expert parallelism** is specific to MoE models, and the expert parallel size should evenly divide the number of experts -- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures - -### Compatibility -- **Sequence parallelism** requires `tensor_model_parallel_size > 1` -- **Expert parallelism** requires MoE models (`num_moe_experts > 0`) -- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs -- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs -- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher -- All parallelism strategies can be combined, but the product of the model-parallel sizes must evenly divide the world size - -## Related Artifacts - -- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification -- **Knowledge card**: 
[skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status - -## Resources - -- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/) -- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/) -- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM) -- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-guide.md -```md -# Performance Tuning Guide - -Megatron-Bridge provides a wide range of features for performant and memory-efficient LLM training on GPUs, and comes pre-configured with optimal settings. However, factors such as model architecture, hyperparameters, GPU count, and GPU type can affect the available options, and additional tuning may be necessary to achieve optimal performance. This document explores the factors that affect training performance, highlights common issues, and outlines techniques for performance tuning that lead to higher MFU (Model FLOPs Utilization) and lower TCO (Total Cost of Ownership). - -```{Note} -This guide makes references to several configuration settings. These settings will be referenced relative to the config class that contains them, e.g. `OptimizerConfig.lr`. Please see the API documentation of each config class for more details on configuration settings. -``` - -```{Note} -This guide references several configuration settings from `TransformerConfig`. Please apply these to the appropriate ModelProvider for your model, e.g. `GPTModelProvider`, as the `ConfigContainer` does not accept a raw `TransformerConfig`. -``` - -## Low Precision Training - -1. Expected speedup of FP8 training compared to BF16 training - - > 1. 
The default low-precision LLM training recipe applies FP8 computation exclusively to the linear layers within the Transformer block, typically achieving a speedup of 1.2–1.5X. - > 2. However, the actual speedup depends on the proportion of training time spent on these linear layers. For instance, smaller LLMs with a limited hidden size exhibit lower FP8 speedup, as linear layers scale with O(sequence_length × hidden_size²) complexity, whereas the other element-wise computation layers (e.g., layer norms, dropouts, RoPE, and simple math functions) scale with O(sequence_length × hidden_size), and dot-product attention scales with O(sequence_length² × hidden_size). Consequently, the contribution of linear layers to the overall training time is smaller in such models. - > 3. Different FP8 recipes use varying quantization block sizes, affecting performance. Smaller quantization blocks generally incur higher overhead in both quantization and GEMM execution. For example, MXFP8 with a 1×32 quantization block performs less efficiently than full tensor-wise FP8 scaling. - -2. Common causes of low FP8 training speedup - - > 1. Training becomes host-performance bound when the LLM uses small GPU kernels (see [Lowering Host Overhead and Jitters](#lowering-overhead-jitter)). - > 2. The linear layers that use FP8 computation account for only a small proportion of the training step time. - -## Parallel Mapping Strategies - -1. Data Parallelism using Distributed Optimizer - - > 1. You should begin with data-parallel (DP) mapping. As long as the model and activation memory fit within the GPUs, data parallelism generally offers optimal performance, minimizes communication overhead, and maximizes per-GPU tensor sizes (compared to per-tensor sharding). - > - > 2. Megatron-Bridge uses the distributed optimizer as the default method for data-parallel training. 
It shards master parameters and optimizer states across data-parallel ranks, reducing model state memory usage without increasing communication overhead compared to traditional data-parallel training. - > - > > 1. `OptimizerConfig.use_distributed_optimizer=true` - -2. Per-tensor Sharding (Tensor-parallel or Context-parallel mappings) - - > 1. Tensor parallelism (TP) is the primary recommendation when a model exceeds GPU memory capacity under data-parallel mapping. However, since it involves higher communication overhead, the tensor-parallel size should ideally be confined to the high-bandwidth intra-node network (NVLink domain). - > - > > 1. `TransformerConfig.tensor_model_parallel_size=` - > - > 2. When the sequence length in a training run is significantly larger than the hidden size, activation memory can overflow. In such cases, context parallelism (CP) helps by sharding tensors along the sequence dimension, allowing the workload to fit within limited GPU memory and improving performance. Like tensor parallelism (TP), CP requires inter-GPU communication of activations. However, for the same tensor sizes, CP generally results in lower communication volume. - -That said, CP’s effectiveness depends on the relative sizes of the sequence length and hidden size. When the sequence length is smaller than the hidden size, CP produces narrow (or "skinny") tensor shards on each GPU. This reduces data reuse and can degrade performance. - -Additionally, because CP shards activations, it also partitions optimizer states in distributed training. As a result, optimizer state partitioning spans both the data parallel (DP) and context parallel (CP) dimensions. - -> > 1. `TransformerConfig.context_parallel_size=` -> -> 1. Performance tips: -> -> > 1. A large tensor-parallel or context-parallel size is not recommended unless the hidden size or sequence length is large enough to maintain sufficient per-GPU parallelism and avoid excessive communication overhead. 
For example, using a tensor-parallel size of 8 for LLAMA 3 70B could lead to low GPU utilization and make training host-performance bound. -> > 2. You can combine TP and CP to optimize performance by balancing communication overhead. For example, using TP=2 along with CP=2 can give better performance than TP=4 when the sequence size is larger than the hidden size. -> > 3. For additional tips, see [Long Sequence Training](#long-sequence-train). - -1. Pipeline Parallelism - - > 1. Pipeline parallelism (PP) is necessary when a model cannot fit within GPU memory using tensor parallelism. Also, virtual pipeline parallelism (VPP) should be used in conjunction with pipeline parallelism to reduce the overhead caused by pipeline warm-up and flush bubbles. - > - > > 1. `TransformerConfig.pipeline_model_parallel_size=` - > > 2. `TransformerConfig.virtual_pipeline_model_parallel_size=` - > - > 2. Performance tips in PP and VPP sizing: - > - > > 1. PP can also be combined with per-tensor sharding methods to mitigate the impact of sharding inefficiencies and pipeline bubbles. For instance, TP4 + PP2 may outperform TP8 when both mappings fit into memory because using a large TP reduces per-GPU tensor sizes but increases the communication cost, increasing the exposed communication. - > > 2. VPP increases inter-stage communication overhead. When a global batch contains many micro-batches, using a smaller VPP size can improve performance, as the exposed communication cost outweighs the reduction in pipeline bubbles. - > - > 3. Asymmetric Transformer layer allocation across pipeline stages - > - > > 1. An LLM with a large vocabulary size has computationally heavy embedding lookup and projection operations, leading to load imbalance across pipeline stages. To address this, Megatron-Bridge provides an option to allocate one fewer Transformer layer in the first and last pipeline stages, which handle embedding lookup and projection, to better balance workloads. - > > - > > > 1. 
`GPTProvider.account_for_embedding_in_pipeline_split=true` - > > > 2. `GPTProvider.account_for_loss_in_pipeline_split=true` - -2. Expert Parallelism - - > 1. Expert Parallelism (EP) is designed specifically for Mixture-of-Experts (MoE) models to efficiently distribute sparse MLP weights across multiple chips. It can be used in combination with other parallelism strategies such as Tensor Parallelism (TP), Context Parallelism (CP), Pipeline Parallelism (PP), Data Parallelism (DP), and Fully Sharded Data Parallel (FSDP). In the current design, the dense attention part and the sparse MLP part are fully decoupled in terms of their TP, CP, and DP parallelism configurations. Expert Tensor Parallelism (ETP) is introduced to specifically control the tensor parallelism for the sparse MLP part. ETP uses TP for dense layers for the ranks allocated for EP in sparse layers. On the other hand, the baseline is DEP, which folds DP in dense layers for EP in sparse layers. - > - > > 1. `TransformerConfig.expert_model_parallel_size=` - > > 2. `TransformerConfig.expert_tensor_parallel_size=` - > - > 2. Performance tips in hybrid folding options and EP sizing: - > - > > 1. Typically, EP is kept within the high-bandwidth intra-node network (NVLink domain) to minimize the communication overhead it can introduce. However, using communication overlap techniques—such as pipeline overlap or 1F1B overlap—along with PP (e.g., DualPipe) might make it possible to expand EP into the inter-node networks. - > > - > > 2. Within the sparse MLP block, DP replaces CP because it has no impact on the computation pattern based on the dispatched tokens in each EP rank. - > > - > > 3. Usually, ETP is set to 1 to avoid significant communication overhead that comes with applying TP to MLP GEMMs. - > > - > > 4. When multiple experts are placed on a single chip after applying Expert Parallelism, enabling grouped GEMM can significantly improve computation efficiency. - > > - > > > 1. 
`TransformerConfig.moe_grouped_gemm=True` - -3. Fully Sharded Data Parallelism - - > 1. Megatron-Bridge supports PyTorch-native FSDP. FSDP can be used in combination with per-tensor sharding methods. - > - > > 1. To use PyTorch FSDP2: - > > - > > > 1. `DistributedInitConfig.use_torch_fsdp2=True` - > - > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios: - > - > > 1. Small models with a large sequence, thus the parameter AllGather and gradient ReduceScatter can effectively be hidden under computation and the short communication overlap causes minor interference to the computation under overlap. - > > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect. - > > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size. - - - - - - - - - - - - - - - - - - - -4. Heterogeneous Encoder Parallelism - - > 1. Encoder Pipeline Parallel - > - > > 1. Use `T5ModelProvider.encoder_pipeline_model_parallel_size`. - > > 2. In an Encoder-Decoder architecture like Multimodal models (VLMs like NeVA etc.), Encoder Pipeline Parallel can be used to add pipeline parallelism to the encoder. - > > 3. Pipeline parallelism controls the amount of pipelining in the decoder part. - > > 4. Encoder Pipeline Parallel is limited to 1 at the moment, i.e., the encoder can occupy a maximum of 1 PP stage. - > > 5. By default, Encoder Pipeline Parallel is 0 and Decoder Pipeline Parallel is 1. - > > 6. When the Encoder Pipeline Parallel size is 0, it shares the first PP stage of the Decoder. - > - > 2. Encoder Tensor Parallel - > - > > 1. Use `T5ModelProvider.encoder_tensor_model_parallel_size`. - > > 2. 
Since encoders tend to be much smaller than decoders, we also provide the ability to set a different amount of tensor parallelism for the encoder than for the decoder. - > > 3. By default, encoder tensor parallel is set to 0, i.e., the amount of tensor parallelism in the encoder is equal to tensor parallelism in the decoder. - > > 4. To use this option, Encoder Pipeline Parallel must be greater than 0 as we need the encoder to be on its own pipeline stage. - > > 5. Encoder Tensor Parallel size is limited to be less than or equal to Tensor parallel size. - > - > 3. Total number of GPUs required when these features are used is: - > - > > 1. Data Parallel size * Context Parallel size * ((Encoder TP * Encoder PP) + (Decoder TP * Decoder PP)) - > - > 4. These features are experimental and may still have bugs. There are critical bug fixes that will be made in a future release. - -5. Parallel mapping strategies with NVL72 - - > 1. Training with only data parallelism or FSDP makes it straightforward to fully utilize the bandwidth of an NVL72 system. However, when combining multiple parallelism strategies, it's important to ensure that high-volume communicators remain confined within each NVL72 domain. For example, with TP=4, DP=16, and PP=4, the GPUs in the first TP group of DP1/PP1 span both NVLink and network domains, causing communication performance to be bottlenecked by the slower network link. To avoid this, you may choose TP and DP sizes such that the product of TP × DP divides evenly into the NVL72 configuration. If the model-parallel size does not align naturally, padding may be required to support non-divisible group sizes. - > 2. To avoid this partitioning complexity, you can just use 64 GPUs out of the 72 GPUs. - -## Communication Overlaps and Tuning - -1. Data-parallel communication of Distributed Optimizer - - > 1. 
Distributed optimizer overlaps parameter AllGathers with the forward computation of the first micro-batch and gradient ReduceScatters with the backward computation of the last micro-batch. - > - > > 1. `DistributedDataParallelConfig.overlap_param_gather=true` - > > 2. `DistributedDataParallelConfig.overlap_grad_reduce=true` - > - > 2. When using the distributed optimizer with pipeline parallelism (PP) + virtual pipeline parallelism (VPP), DP communications overlap with multiple micro-batches, increasing the opportunity for effective overlap. Also, Megatron-Bridge aligns the execution timing of DP communications across pipeline-parallel ranks to synchronize the computing kernel slowdown from the overlap. - > - > > 1. `DistributedDataParallelConfig.align_param_gather=true` - > - > 3. Slow DP communication at large scaling training: - > - > > 1. Distributing optimizer states across a partial DP domain reduces communication costs over high-latency Ethernet networks. Model states remain replicated outside the distributed domain. During the final micro-batch backpropagation, gradient ReduceScatters occur within the distributed domain, followed by AllReduce in the non-distributed domain. Parameter AllGathers are performed only within the distributed domain. - > > - > > > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances= ` - > > - > > 2. A large message size for DP communication is recommended to maximize network bandwidth utilization. You can achieve this by increasing the communication bucket size. - > > - > > > 1. `DistributedDataParallelConfig.bucket_size=` - > - > 4. A common reason for DP communication overlap failure: - > - > > 1. Persistent Layer Normalization (LN) kernels from Transformer Engine use spin-waiting for all SMs in the GPU, causing the LN kernel and subsequent computation kernels to be scheduled only after DP communication. To prevent this, an appropriate SM margin should be configured using the following environment variables. 
- > > - > > > 1. `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>` - > > > 2. `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>` - - - - - -3. Tensor-parallel (TP) communication (with sequence parallelism) - - > 1. Megatron-Bridge currently uses the userbuffer backend in Transformer Engine for TP communication overlaps. This offers the pipelined overlap of the TP communication with dependent computation. - > - > > 1. `CommOverlapConfig.tp_comm_overlap` - > - > 2. The overlap method, resource, and precision of the TP communication overlaps are configurable, and the most performant configurations are set in the Megatron-Bridge training recipes by default. Also, you can set a custom TP communication overlap configuration via the below interface following the structure of TransformerLayerTPOverlapCfg class. - > - > > 1. `CommOverlapConfig.tp_comm_overlap_cfg=` - > - > 3. TP communication overlap setting tips - > - > > 1. Balancing the number of SMs between communication and GEMM - > > - > > > 1. For AllGather/ReduceScatter bulk and ReduceScatter pipelined overlap, you can adjust the number of SMs to balance communication and GEMM execution. Allocating too many SMs to communication may degrade GEMM performance, while too few may expose communication overhead. The default SM allocation for communication is 16, but you can fine-tune it based on profiling results. - > > > 2. `TPOverlapCfg.num_sm=` - > > - > > 2. CGA sizing to improve SM utilization - > > - > > > 1. The CGA size can be set between 1 and 4, but it should not exceed the number of SMs allocated for communication. We recommend using CGA ≤ 2 to prevent potential SM rasterization that could impact GEMM performance. - > > > 2. `TPOverlapCfg.cga_size=` - > > - > > 3. Use 4× splits for ReduceScatter and GEMM overlap to optimize the balance between GEMM efficiency and communication exposure. - > > - > > > 1. In GEMM-then-ReduceScatter pipeline overlap, a 1× ReduceScatter chunk remains exposed. 
A small split size increases communication exposure, while a large split size may degrade performance due to aggregated GEMM wave quantization. We find that num_splits = 4 generally provides the best performance. - > > > 2. `TPOverlapCfg.num_split=` - > - > 4. Common reasons for TP communication overlap failure on Hopper - > - > > 1. On H100 GPUs, the environment variable `CUDA_DEVICE_MAX_CONNECTIONS=1` should be set. Otherwise, TP communication kernels may be scheduled only at the end of the GEMM they are supposed to overlap with. - > > 2. Pipelined TP communication overlap uses a static userbuffer registered upon model initialization. Therefore, it doesn't support activation tensors dynamically changing between steps or between Transformer layers. - -4. Context-parallel (CP) communication - - > 1. CP communication is configurable via "cp_comm_type", which can be "p2p", "all_gather", "a2a", or "a2a+p2p". Communications of "p2p" are implemented as ring-exchange send/receive operations, and they are hard-coded to overlap with the attention compute of sequence chunks. See [Long Sequence Training](#long-sequence-train) for more details. - -5. Expert-parallel communication - - > 1. To hide the A2A/AG communication introduced by EP, pipeline split overlap or 1F1B overlap alongside Pipeline Parallelism could be possible. It will be added to Megatron-Bridge in future releases. - -6. Pipeline-parallel (PP) send/receive communication - - > 1. PP send/recv in steady 1F1B states are set to be overlapped with computes by default. - > 2. The PP send/recv in warmup and flush are exposed by default. - -(comm-data-types)= -## Communication Data Types - -1. FP8 data-parallel parameter AllGather in Distributed Optimizer and FSDP - - > 1. Megatron-Bridge supports FP8 parameter AllGather for per-tensor FP8 scaling recipes. This operation is lossless, enhancing performance while reducing memory usage. - > - > > 1. `MixedPrecisionConfig.fp8_param=true` - -2.
BF16 (instead of FP32) data-parallel reduction in Distributed Optimizer and FSDP - - > 1. We have validated that BF16 reduction is numerically safe across numerous model training runs. However, BF16 reduction with a large data-parallel size (e.g., DP ≥ 128), especially the Ring reduction algorithm—which accumulates copies sequentially—may impact numerical stability. When using SHARP with NVIDIA InfiniBand, BF16 reduction is more robust, as it performs binary additions with higher precision for intermediate partial reductions. - > - > > 1. `DistributedDataParallelConfig.grad_reduce_in_fp32=false` - -3. FP8 tensor-parallel ReduceScatter - - > 1. When communication latency exceeds GEMM execution time, using FP8 input ReduceScatter can better hide communication overhead. This approach has low numerical impact, as the GEMM output must be cast to FP8 and then converted back to high precision during reduction. - > - > > 1. `TPOverlapCfg.fp8_buf=true` - -4. FP8 A2A Dispatch for expert parallel communication - - > 1. Megatron-Bridge is working on supporting FP8 A2A dispatch (before expert FC1), but still keeps BF16 A2A combine (after expert FC2). - -## Performance at Scale - -1. Scaling a training job is typically achieved by increasing the size of the data-parallel domain. In large-scale training, this often results in a small number of micro-batches per global batch—or even a single micro-batch—causing most computations to overlap with data-parallel communication. To maintain high performance in such scenarios, you should focus on minimizing the overhead of data-parallel communication and reducing host-driven inter-GPU jitter. - -2. 
You can lower the overhead of data-parallel communication by (1) reducing the communication precision e.g., BF16 for gradient reduction and FP8 parameter gathering, (2) improving the efficiency of communication by increasing the data-parallel communication message size or using the hierarchical data-parallel reduction, or (3) using multi-cast and switch reduction with SHARP in case of InfiniBand network. - - > 1. Using BF16 gradient reduction and FP8 parameter gather are described in [Communication Data Types](#comm-data-types) - > - > 2. For non-pipeline-parallel training, the data-parallel communication bucket size can be adjusted using the knobs below. In pipeline-parallel training, however, the bucket size is fixed and determined by the number of parameters assigned to each virtual pipeline rank. - > - > > 1. `DistributedDataParallelConfig.bucket_size=` - > - > 3. Setting the knob below splits the data-parallel domain of the distributed optimizer into a sharding domain and a replication domain. Gradient reduction then occurs in two stages—one within each domain—avoiding the use of a single large flat ring for collective operations that have high latency. - > - > > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances=` - -3. Ideas to reduce the host-driven inter-GPU jitters are discussed in [Lowering Host Overhead and Jitters](#lowering-overhead-jitter). - -(lowering-overhead-jitter)= -## Lowering Host Overhead and Jitters - -1. Common observation associated with host overhead - - > 1. Significantly low GPU FLOPS. - > 2. Small performance gain of low-precision (FP8) training. - > 3. Small LLMs with small hidden size or sequence length or fine-tuning without sequence packing - > 4. High multi-GPU communication variation. - -2. Increasing micro-batch size and reduce per-tensor sharding - - > 1. 
The most common way to increase per-GPU tensor size is by increasing the micro-batch size or minimizing unnecessary per-tensor sharding (e.g., TP or CP) when GPU memory permits. - -3. Manual garbage collection to align the host interruption across GPUs - - > 1. Megatron-Bridge manually aligns the timing of garbage collection across GPUs, which significantly mitigates the host overhead compared to the baseline automatic garbage collection. - > - > > 1. `TrainingConfig.manual_gc_interval=` - -4. CUDA graph to eliminate repeated static host code execution - - > 1. Megatron-Bridge supports graph capture, significantly reducing host overhead. CUDA Graph is applicable only to LLMs with a static tensor shape across training steps. For example, it supports fixed-size packed sequences but does not handle sequences with varying lengths at each step. Also, MoE models with token-dropless propagation have limited CUDA graph support, restricted to the dense modules only. - > 2. CUDA graph requires additional memory for static buffer management, typically adding a few gigabytes for static buffers, while models with PP size > 1 may consume over 10GB. We are actively working to reduce this memory overhead. - > 3. See [CUDA Graphs](training/cuda-graphs.md) for configuration details (`cuda_graph_impl`, `cuda_graph_scope`). - -5. Bind CPU memory for GPU processes - - > 1. Binding CPU cores to GPU processes helps mitigate long latency issues and ensures minimal variation in GPU queuing latency across GPUs. This optimization has a significant impact, particularly when the communication domain size is large. - > 2. Example command line for an x86-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/4)) --membind=$((SLURM_LOCALID/4)) ` - > 3.
Example command line for a Grace-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/2)) --membind=$((SLURM_LOCALID/2)) ` - -(reducing-memory-overflow)= -## Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency - -1. Activation recomputation - - > 1. Megatron-Bridge LLMs default to dot-product attention-only recomputation using Flash Attention, efficiently regenerating large intermediate activations from the attention operation with minimal computational overhead. - > - > 2. Megatron-Bridge also supports recomputing the full intermediate activations of a Transformer block, significantly reducing activation memory usage at the cost of approximately 30% additional computation. The number of Transformer blocks to recompute can be adjusted using a configurable setting. - > - > > 1. `TransformerConfig.recompute_granularity=full` - > > 2. `TransformerConfig.recompute_method=block` - > > 3. `TransformerConfig.recompute_num_layers=` - -2. Activation offloading to host memory - - > 1. Megatron-Bridge supports offloading activation memory to host memory, essential for training tasks constrained by activation memory. This is particularly useful for scenarios like (1) FSDP, where model state memory is minimized through sharding but activation memory remains high, (2) LoRA, which has frozen parameters but significant activation memory demands, and (3) training with a large sequence length. The efficiency of activation offloading depends on both the interconnect bandwidth between the GPU and host and the host memory bandwidth. From this perspective, Grace-based systems like the GB200 enhance offloading performance by optimizing these bandwidths. - > - > 2. The following knobs should be configured to enable offloading and specify the number of Transformer layers to offload to host memory. The maximum number of layers that can be offloaded depends on host memory capacity, which may be lower when the CPU is shared among multiple GPUs.
- > - > > 1. `TransformerConfig.cpu_offloading=True` - > > 2. `TransformerConfig.cpu_offloading_weights=False` - > > 3. `TransformerConfig.cpu_offloading_num_layers= ` - > - > 3. Environment variable settings to avoid resource conflict between CPU memory offloading and network communication - > - > > 1. `NCCL_NET_GDR_LEVEL=PHB # NCCL <=2.25` - > > 2. `NCCL_NET_GDR_C2C=1 # NCCL >=2.26` - > - > 4. Optimization tips - > - > > 1. Given the ratio between activation volume and computational operations, offloading all layer activations naively can become a performance bottleneck. Optimizing performance requires tuning the number of layers to offload while balancing it with recomputation. - -3. Weight memory-optimized BF16 training - - > 1. In BF16 training, Megatron-Bridge optimizes memory usage by storing only the BF16 remainder of the master weight copies for the next optimizer update. This is possible because BF16 data can be represented using a subset of FP32 bits, allowing Megatron-Bridge to avoid redundant storage of the FP32 portion used for BF16 representation. This is default enabled when using precision-aware optimizer in Megatron Core. - > - > > 1. `OptimizerConfig.use_precision_aware_optimizer=True` - -4. Common memory usage hikes from environment variable setting - - > 1. The below environment variables will (1) avoid preserving the buffers for NCCL communication and (2) disable NVLSharp when not used. Both these options lower the GPU memory usage. - > - > > 1. `TORCH_NCCL_AVOID_RECORD_STREAMS=1` - > > 2. `NCCL_NVLS_ENABLE=0` - > - > 2. While not enabled by default, you can further reduce memory usage caused by segmentation penalties by setting the env var shown below. - > - > > 1. `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - -5. Keep parameters in FP8 at FP8 training - - > 1. In FP8 training, after optimizer step execution, we can keep the parameters in FP8. 
Compared to the baseline that keeps the intermediate weight values in BF16, FP8 parameters lower memory usage and improve communication performance. The below knob enables keeping the parameters in FP8. - > - > > 1. `MixedPrecisionConfig.fp8_param_gather=True` - -## Operator Fusion - -1. You can control specific fusion behaviors using the following configuration knobs: - - > 1. `TransformerConfig.masked_softmax_fusion=true` - > 2. `GPTProvider.cross_entropy_loss_fusion=true` - > 3. `GPTProvider.gradient_accumulation_fusion=true` - > 4. `TransformerConfig.bias_activation_fusion=true` - > 5. `TransformerConfig.bias_dropout_fusion=true` - > 6. `TransformerConfig.apply_rope_fusion=true` - -2. Megatron-Bridge offers different Flash Attention options, which can be chosen through the model config: - - > 1. Let Transformer Engine decide (default): `TransformerConfig.attention_backend=AttnBackend.auto` - > 2. FlashAttention2: `TransformerConfig.attention_backend=AttnBackend.flash` - > 3. cuDNN fused attention: `TransformerConfig.attention_backend=AttnBackend.fused` - -(long-sequence-train)= -## Long Sequence Training - -1. Problem of long sequence training - - > 1. Training with long sequence length can lead to memory overflow due to the huge memory cost of activations. The problem could be solved by recomputing activations in backward, but it can impose up to ~30% overheads in each training step. Context parallelism is a better solution which splits the sequence dimension across multiple GPUs, so that each GPU only computes and saves activations of a sequence chunk. In this way, memory overflow is addressed without introducing any redundant compute. - -2. CP to shard activation (knob) - - > 1. `TransformerConfig.context_parallel_size=` - > - > > 1. Both TP and CP can reduce activation memory overheads. It's not wise to be biased to either of them. Communications of TP and CP are overlapped by GEMM and Attention respectively. 
Blindly enlarging their sizes can make some communications hard to overlap. It's recommended to sweep a combination of TP+CP configs. The optimal config is expected to make full use of all related compute and do best overlapping, thereby achieving best end-to-end performance. - > - > 2. `TransformerConfig.cp_comm_type= or ` - > - > > 1. Megatron-Core provides multiple implementation variants of CP and allows you to make choices based on your specific use cases by configuring "cp_comm_type". The configuration value can be `p2p`, `all_gather`, `a2a`, or `a2a+p2p`. These communication types are compatible with each other, so they can be flexibly interleaved between transformer layers. You only need to provide a list, where each element corresponds to a layer. - > > 2. `p2p`: exchanges KV sequence chunks in ring-topology. The P2P communications can be fully overlapped. - > > 3. `all_gather`: inserts an all-gather before attention to get a full sequence of KV. The all-gather is exposed, but it should not impose big overheads if GQA/MQA are used, as they have very few KV heads. - > > 4. `a2a`: is an implementation of DeepSpeed Ulysses. A2A communications are added before and after the attention module to gather full sequence length and further scatter heads in CP domain. A2A cannot be overlapped. - > > 5. `a2a+p2p`: is a middle ground between `a2a` and `p2p`. This is useful for cases of big CP sizes, where each sequence chunk is too short to overlap P2P communications. It first does A2A in partial CP groups to gather relatively longer sequence chunks, then applies P2P implementation to the gathered chunks. It also can be helpful for hierarchical CP communications, for example A2A and P2P happen in NVLink and IBLink domains respectively. - > > 6. With small and medium CP size, `p2p` is the recommended configuration because communications can be fully overlapped; "all_gather" also should work fine with GQA/MQA. 
As for strongly-scaling a sequence length with big CP sizes, the short chunk length can barely overlap the `p2p` communications, so `a2a+p2p` ought to be the preferred choice. `a2a` could be adopted in some cases for its simplicity. However, CP size can be restricted with "a2a" because it requires the number of attention heads to be divisible by CP size. Restricted CP size will finally limit the sequence length that can be run. - -3. Activation recomputation (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow)) - -4. Activation offloading to host memory (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow)) - -## Sequence Packing for Performant Fine-Tuning - -1. Dataset preparation - - > 1. Fine-tuning datasets with shorter sequences of variable length can be packed into longer sequences, up to a set maximum length, for best efficiency. - -2. To use this feature, the microbatch size must be set to 1. In place of increasing the micro batch size, the maximum sequence length can be increased, which will effectively increase the number of individual sequences per packed sequence. - -3. Enabled with: - - > 1. `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size=` - > 2. `TrainingConfig.micro_batch_size=1` - -4. Performance benefits also include: - - > 1. Inconsistent lengths between sequences in the fine-tuning dataset would reduce the computation efficiency. With a micro-batch size over 1, all sequences must be padded with empty tokens to the length of the longest one in the micro-batch. Similarly, some optimizations like CUDA graphs require uniform sequence lengths between micro-batches. Packed sequences are arranged so that the total number of tokens per packed sequence is as close to the maximum length as possible, making most processed tokens useful. - > 2. 
Likewise, when using data parallelism, variance in the time needed to process different batches can result in all batches needing to wait for the longest to finish, and this variance is reduced with packed sequences. - -## GPU Core Clock Optimization - -1. Increase the clock ratio of GPU core over off-chip memory system - - > 1. NVIDIA GPUs support a GPU core clock boost mode, which increases the core clock rate by reducing the off-chip memory clock rate. This is particularly beneficial for LLMs, which are typically compute throughput-bound. - > - > > 1. `sudo nvidia-smi boost-slider --vboost 1 ` - -## Profiling Options for Analysis-based Performance Tuning - -1. Nsight system profile - - > 1. Megatron-Bridge provides an interface to enable the NVIDIA Nsight Systems profiler, which displays the GPU execution trace of all CUDA streams. You can check whether communication kernels overlap with computation kernels and adjust resource allocation to balance communication and computation. The Nsight Systems profile can be enabled using ProfilingConfig, as shown below. - > 2. `ProfilingConfig(use_nsys_profiler=True, profile_start_step=, profile_end_step=, profile_ranks=<[0,...]>)` - -2. Memory snapshot - - > 1. Megatron-Bridge provides an interface to extract the memory snapshot that shows the memory allocation bytes, the allocation lifespan, and the function call stack. Extracting the memory snapshot can be enabled by ProfilingConfig as shown below. - > 2. `ProfilingConfig(record_memory_history=True, memory_snapshot_path=)` - -## DeepEP: Common Issues and Solutions - -DeepEP is a communication library optimized for Mixture-of-Experts (MoE) all-to-all operations. When using DeepEP for cross-node Expert Parallelism (EP), there are several common issues related to network transport and GPU-NIC affinity that can significantly impact performance. - -> Note: DeepEP is best optimized for NVL8 systems such as the DGX-B200 NVL8 or DGX-H200 NVL8.
For GB200 NVL72 rack-scale systems, where 72 GPUs are interconnected within the same NVLINK domain, we recommend using [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) instead of DeepEP. HybridEP is maintained by NVIDIA and is specifically optimized for NVL72 rack scale systems. It is also integrated into the Megatron-core [fused all-to-all module](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.transformer.moe.fused_a2a.html) as an alternative backend under the `flex` token dispatcher. -> -> Learn more about GB200 MoE training best practices [here](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md). - -### 1. Why is my DeepEP not working - -1. What is IBGDA and why is it a problem - - DeepEP achieves optimal cross-node communication performance using InfiniBand GPU Direct Async (IBGDA), which is supported by ConnectX NICs in both InfiniBand and RoCEv2 modes. However, IBGDA is not always enabled by default—it often requires cluster administrators to actively configure the system and enable GPU Direct RDMA support in the InfiniBand (or RoCEv2) fabric. If this configuration step is skipped or unsupported in the cluster environment, IBGDA may be unavailable, which can prevent DeepEP inter-node EP capability from functioning. - -1. Network Transport: IBGDA vs. IBRC - - > 1. IBGDA (InfiniBand GPU Direct Async) requires cluster administrators to enable GPU Direct RDMA and configure the InfiniBand subsystem. Many clusters do not have IBGDA enabled by default. - > 2. The official DeepEP main branch has removed support for IBRC (InfiniBand Reliable Connection), which previously served as a fallback mechanism. 
With IBRC, a CPU proxy thread will assist in processing the EP communication, which might have performance degradation compared to IBGDA, but we find such performance degradation doesn't overshadow the benefit of enabling wideEP in production training. - -2. Solution: NVSHMEM 3.5 with Automatic Transport Fallback - - > 1. NVSHMEM 3.5 introduces improved auto-fallback support for cross-node communication under various network configurations. It can automatically select the best available transport (IBGDA, IBRC, or other supported mechanisms) based on cluster capabilities. - > 2. To benefit from NVSHMEM’s auto-fallback in DeepEP: - > - Download the [official NVSHMEM 3.5.19-1 release](https://github.com/NVIDIA/nvshmem/releases/tag/v3.5.19-1). You can also choose to compile it from source in your container environment; we provide such examples later in this guide. - > - Switch to the [DeepEP branch with native NVSHMEM API integration](https://github.com/seth-howell/DeepEP/tree/nvshmem_native_apis). This branch enables automatic use of NVSHMEM’s fallback mechanisms without requiring any manual code modifications. - -### 2. GPU-NIC Affinity and Bandwidth Contention - -A common cause of poor DeepEP performance is incorrect GPU-to-NIC (Network Interface Card) affinity, where multiple GPUs compete for bandwidth on a single NIC. As noted in [DeepEP PR #466](https://github.com/deepseek-ai/DeepEP/pull/466), cross-node EP performance may degrade if multiple GPUs use the same NIC, due to certain GPU-NIC affinity in some clusters. This PR provides a solution by supporting the environment variable `DEEP_EP_DEVICE_TO_HCA_MAPPING` to specify GPU-to-NIC mappings so that each GPU is automatically bound to the optimal NIC for maximum DeepEP throughput. - -With this PR's solution, we need the following environment variables to map GPUs to NICs correctly. First, you need to find out the names of the NICs by running `ibstat`. 
In our example, we found the following for one RoCEv2 DGX-B200 cluster: -``` -> ibstat | grep ^CA -CA 'rocep145s0' -CA 'rocep146s0' -CA 'rocep152s0' -CA 'rocep153s0' -CA 'rocep198s0' -CA 'rocep199s0' -CA 'rocep205s0' -CA 'rocep206s0' -``` - -Use the following environment variables to map GPUs to NICs. Note that `0:rocep145s0:1` is formatted as `::` so that each GPU will only be mapped to one dedicated NIC. -```bash -export NVSHMEM_ENABLE_NIC_PE_MAPPING=1 -export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1" -``` - -### 3. Build DeepEP - -In this section, we provide a reference Dockerfile that shows how to build NVSHMEM 3.5 and the customized DeepEP into your container environment. - -Note that the following example is provided for DGX-B200 NVL8 systems, but similar ideas apply to Hopper generation as well—just change the Dockerfile accordingly. For example, you just need to change the compile target for SM90. - -Key points: - -- NVSHMEM source: https://github.com/NVIDIA/nvshmem/tree/v3.5.19-1 -- DeepEP branch that we cherry-picked with all the fixes above: https://github.com/zhongbozhu/DeepEP/tree/nvshmem_deepep_gcp -- Example training container template for DGX-B200: https://github.com/yanring/Megatron-MoE-ModelZoo/blob/main/dockers/B200.Dockerfile - -**Dockerfile** -```bash -FROM nvcr.io/nvidia/pytorch:25.11-py3 as base - -# Other dependencie you may want -... 
- -# Dependency of IBGDA -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -# Clone DeepEP customized version -WORKDIR /home/dpsk_a2a -RUN git clone https://github.com/zhongbozhu/DeepEP.git ./deepep -RUN cd ./deepep && git checkout nvshmem_deepep_gcp && cd /home/dpsk_a2a - -# Clone NVSHMEM 3.5 https://github.com/NVIDIA/nvshmem -RUN git clone --branch v3.5.19-1 https://github.com/NVIDIA/nvshmem.git ./deepep-nvshmem -RUN cd ./deepep-nvshmem && git checkout v3.5.19-1 && cd /home/dpsk_a2a - -# Build nvshmem from source -# You can also download the pre-built binary, and skip the following -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y \ - clang \ - llvm-dev \ - libclang-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /home/dpsk_a2a/deepep-nvshmem -RUN mkdir -p build && mkdir -p install && \ - cmake -S . -B build \ - -DCMAKE_INSTALL_PREFIX=/home/dpsk_a2a/deepep-nvshmem/install \ - -DCUDA_HOME=/usr/local/cuda \ - -DMPI_HOME=/opt/hpcx/ompi \ - -DMPI_C_COMPILER=/opt/hpcx/ompi/bin/mpicc \ - -DMPI_CXX_COMPILER=/opt/hpcx/ompi/bin/mpicxx \ - -DNVSHMEM_MPI_SUPPORT=OFF \ - -DNVSHMEM_IBRC_SUPPORT=ON \ - -DNVSHMEM_IBGDA_SUPPORT=ON \ - -DNVSHMEM_IBDEVX_SUPPORT=OFF \ - -DNVSHMEM_UCX_SUPPORT=OFF \ - -DNVSHMEM_SHMEM_SUPPORT=OFF \ - -DNVSHMEM_PMIX_SUPPORT=OFF \ - -DNVSHMEM_USE_NCCL=OFF \ - -DNVSHMEM_USE_GDRCOPY=ON \ - -DGDRCOPY_HOME=/usr \ - -DNVSHMEM_USE_MLX5DV=ON \ - -DNVSHMEM_BUILD_TESTS=ON \ - -DNVSHMEM_BUILD_EXAMPLES=ON \ - -DNVSHMEM_BUILD_PYTHON_LIB=OFF \ - -DNVSHMEM_BUILD_BITCODE_LIBRARY=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES="100" && \ - cmake --build build -j && \ - cmake --install build - -ENV NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install -ENV LD_LIBRARY_PATH=${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH -ENV PATH=${NVSHMEM_DIR}/bin:$PATH - -## Build deepep -WORKDIR /home/dpsk_a2a/deepep -ENV TORCH_CUDA_ARCH_LIST="10.0" -ENV PIP_NO_BUILD_ISOLATION=1 -ENV 
CPATH=${CUDA_HOME}/include/cccl:$CPATH -RUN pip install --no-build-isolation . - -``` - -DeepEP provides `test_internode.py` to test and benchmark cross-node EP communication. In our experiment, when using 4 nodes of DGX-B200 (i.e., EP32), the achieved throughput for cross-EP is about 50 GB/s with IBRC. We provide an example SLURM script below for running such a test with DeepEP. - -In another experiment on the same cluster, with IBGDA enabled by the cluster admin, we observed approximately 10% higher inter-node performance—roughly 55 GB/s. To enable IBGDA, you need to set the environment variable `export NVSHMEM_IB_ENABLE_IBGDA=true`; there is no need to change the software version or container, because with the software provided above, both modes will work. - -```bash -srun --account= -N 4 -p batch --time 30 \ - --ntasks-per-node=1 --gpus-per-node=8 \ - --no-container-mount-home --container-mounts "/lustre:/lustre" \ - --container-image \ - --mpi=none --export=ALL \ - bash -lc ' -set -eo pipefail - -# Env Var for GPU-NIC mapping -export NVSHMEM_ENABLE_NIC_PE_MAPPING=1 -export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1" - - -# 1) Expand SLURM_JOB_NODELIST and grab the first hostname -headnode=$(python - </dev/null 2>&1; then - master_ip=$(getent ahostsv4 "$headnode" | awk "{print \$1; exit}") -else - master_ip="" -fi -MASTER_ADDR="${master_ip:-$headnode}" - -# 3) Export rendezvous env that matches test_internode.py expectations -export MASTER_ADDR -export MASTER_PORT=${MASTER_PORT:-29500} -export WORLD_SIZE=${SLURM_NNODES:-2} # number of nodes -export RANK=${SLURM_NODEID:-0} # 0..N-1 per node - -export OMP_NUM_THREADS=1 -python -u /home/dpsk_a2a/deepep/tests/test_internode.py -' - -``` - - - - - - - - - - -## Index - List of Tuning Knobs - -- `CommOverlapConfig.tp_comm_overlap` -- `CommOverlapConfig.tp_comm_overlap_cfg` -- `CUDA_DEVICE_MAX_CONNECTIONS` -- 
`TrainingConfig.manual_gc_interval` -- `MixedPrecisionConfig.fp8_param` -- `ProfilingConfig` -- `NCCL_NET_GDR_C2C` -- `NCCL_NET_GDR_LEVEL` -- `NCCL_NVLS_ENABLE` -- `NVTE_BWD_LAYERNORM_SM_MARGIN` -- `TransformerConfig.attention_backend` -- `AttnBackend` -- `NVTE_FWD_LAYERNORM_SM_MARGIN` -- `PYTORCH_CUDA_ALLOC_CONF` -- `TrainingConfig.micro_batch_size` -- `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size` -- `TransformerConfig.apply_rope_fusion` -- `TransformerConfig.bias_activation_fusion` -- `TransformerConfig.bias_dropout_fusion` -- `TransformerConfig.cp_comm_type` -- `TransformerConfig.cpu_offloading` -- `TransformerConfig.cpu_offloading_num_layers` -- `TransformerConfig.cpu_offloading_weights` -- `GPTProvider.cross_entropy_loss_fusion` -- `TransformerConfig.cuda_graph_impl` / `cuda_graph_scope` (see [CUDA Graphs](training/cuda-graphs.md)) -- `MixedPrecisionConfig.fp8_param_gather` -- `GPTProvider.gradient_accumulation_fusion` -- `TransformerConfig.masked_softmax_fusion` -- `TransformerConfig.recompute_granularity` -- `TransformerConfig.recompute_method` -- `TransformerConfig.recompute_num_layers` -- `OptimizerConfig.use_precision_aware_optimizer` -- `GPTProvider.account_for_embedding_in_pipeline_split` -- `GPTProvider.account_for_loss_in_pipeline_split` -- `TransformerConfig.context_parallel_size` -- `DistributedDataParallelConfig.align_param_gather` -- `DistributedDataParallelConfig.bucket_size` -- `DistributedDataParallelConfig.bucket_size` -- `DistributedDataParallelConfig.data_parallel_sharding_strategy` -- `DistributedDataParallelConfig.grad_reduce_in_fp32` -- `DistributedDataParallelConfig.num_distributed_optimizer_instances` -- `DistributedDataParallelConfig.overlap_grad_reduce` -- `DistributedDataParallelConfig.overlap_param_gather` -- `T5ModelProvider.encoder_pipeline_model_parallel_size` -- `T5ModelProvider.encoder_tensor_model_parallel_size` --
`TransformerConfig.expert_model_parallel_size=` -- `TransformerConfig.expert_tensor_parallel_size=` -- `TransformerConfig.moe_grouped_gemm` -- `DistributedInitConfig.use_torch_fsdp2` -- `TransformerConfig.pipeline_model_parallel_size` -- `TransformerConfig.tensor_model_parallel_size` -- `TransformerConfig.virtual_pipeline_model_parallel_size` -- `OptimizerConfig.use_distributed_optimizer` -- `TORCH_NCCL_AVOID_RECORD_STREAMS` -- `TPOverlapCfg.cga_size` -- `TPOverlapCfg.fp8_buf` -- `TPOverlapCfg.num_sm` -- `TPOverlapCfg.num_split` - - - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/communication-overlap.md -```md -# Communication Overlap - -Communication overlap reduces exposed communication cost in distributed training -by hiding collectives or point-to-point transfers under useful compute. - -This page is the stable guide for what communication overlap is, when it tends -to help, and which boundaries are durable across Megatron Bridge. For exact -knobs, code anchors, and verification commands, see: - -- `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md` -- `skills/perf-techniques/expert-parallel-overlap/SKILL.md` - -## What It Is - -In Bridge, communication overlap is a family of related techniques rather than a -single switch: - -| Mode | What gets hidden | Main gate | -|---|---|---| -| DP | gradient reduce-scatter and parameter all-gather | distributed-optimizer overlap path | -| TP | tensor-parallel collectives under layer compute | `CommOverlapConfig.tp_comm_overlap` plus sequence parallelism | -| PP | pipeline send/recv work under schedule execution | pipeline schedule and virtual pipeline layout | -| CP | context-parallel communication inside CP execution paths | CP implementation choice | -| EP | MoE token dispatch/combine communication under expert compute | `overlap_moe_expert_parallel_comm` | - -These paths share the same goal, but they do not share the same enablement -rules, evidence level, or failure modes. 
- -## What Problem It Solves - -Distributed training often becomes communication-bound before it becomes -compute-bound. Once TP, DP, PP, CP, or EP traffic is visible on the critical -path, adding more GPUs may raise communication time faster than it raises useful -compute. - -Communication overlap addresses that by moving communication earlier or later in -the step so the same transfer can happen while some other part of the model is -already doing useful work. It does not change the training objective. It tries -to reduce idle time. - -## Impacted Training Dimensions - -| Dimension | Effect | Confidence | Why | -|---|---|---|---| -| `speed` | ~0-15% faster step time, mode-dependent | medium | The whole point is to hide communication time, but gain depends strongly on which overlap mode is active and whether communication is actually exposed. EP overlap measured flat to ~13% slower on small-EP Qwen3-30B-A3B, so gains are not guaranteed. | -| `memory` | neutral (some modes add ~1-2 GB for buffers) | low | Overlap itself is usually not a primary memory technique, although some implementations (e.g., TP userbuffers) add buffer or scheduling constraints. | -| `scale` | positive at higher parallelism degrees | medium | Overlap becomes more valuable as communication dominates larger distributed runs. | -| `convergence` | no change expected | medium | The intent is to preserve the same training math, though schedule changes can alter floating-point accumulation order. | -| `stability` | adds operational constraints | medium | More overlap usually means tighter requirements around schedule shape, precision, runtime versions, and feature combinations. 
| - -## When to Use It - -Enable communication overlap when all of the following are mostly true: - -- the distributed configuration already works correctly without overlap -- communication is a meaningful part of step time -- you are tuning throughput or utilization, not doing first bring-up -- you can benchmark the specific overlap mode you plan to use - -As a rule of thumb: - -| Mode | Good first use case | Recommendation | -|---|---|---| -| DP | distributed optimizer on multi-GPU or multi-node training | Usually worth considering early once optimizer sharding is already chosen. | -| TP | `TP >= 2` with sequence parallelism and TE-enabled path | Benchmark when TP collectives are visible in the profile. | -| PP | interleaved pipeline schedules where p2p overhead is visible | Treat as schedule tuning, not a blanket PP default. | -| CP | large-context runs already using CP | Follow the CP-specific guidance rather than treating it as a separate generic knob. | -| EP | large-scale MoE with many micro-batches and inter-node A2A cost | Most promising at larger EP and with higher-latency dispatcher backends. | - -Measured repo evidence today is strongest for MoE EP overlap. On -Qwen3-30B-A3B with EP=4 and `alltoall` on 2 H100 nodes, EP overlap is -numerically safe at GBS=8 but provides no speedup, and it is about 13% slower -at GBS=64. On Qwen3-Next-80B-A3B with EP=8 and `alltoall` on 8 nodes, the -overlap variants are stable while the non-overlap baseline NaNs, but -`delay_wgrad_compute` is still about 4.8% slower than overlap-only. That makes -EP overlap correctness-backed in this repo, but not yet broadly speedup-backed. 
- -## When Not to Use It - -Avoid communication overlap when any of these are true: - -- you are still debugging a new distributed setup -- the profile is compute-bound rather than communication-bound -- the required companion feature is missing, such as sequence parallelism for TP -- another feature already imposes conflicting runtime constraints -- you have not benchmarked the exact model and parallelism shape - -For MoE EP overlap specifically, avoid treating it as a default when: - -- `EP <= 4` with `alltoall` on `<= 2` nodes -- the run has very few pipeline micro-batches -- `moe_shared_expert_overlap` must stay enabled -- full recompute or recompute scheduling incompatible with EP overlap is required - -## Feature Interactions - -The most important interactions are: - -- DP overlap is tied to distributed-optimizer behavior rather than a fully independent tuning path. -- TP overlap depends on sequence parallelism and the supported TE overlap path. -- PP and EP overlap interact with virtual pipeline layout when `PP > 1`. -- CP overlap should be reasoned about together with the chosen CP communication type. -- EP overlap with DeepEP or HybridEP requires explicitly switching the dispatcher to `flex`. -- EP overlap and `moe_shared_expert_overlap` are mutually exclusive. -- CUDA graphs plus `delay_wgrad_compute` adds extra TE-version and graph-scope restrictions. -- Launch-time environment tuning can conflict across overlap paths, especially TP or CP overlap versus DeepEP or HybridEP tuning. - -## Bridge Configuration - -Communication overlap is configured through `CommOverlapConfig` plus -mode-specific model settings. There is no single universal toggle — DP, TP, -PP, CP, and EP each have different prerequisites and should be enabled based -on the actual bottleneck. 
- -For config examples and minimal runnable commands, see: - -- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md) -- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md) - -## Expected Metric Changes - -| Metric | Expected Change | Conditions | Evidence | -|---|---|---|---| -| `step_time` | down | DP overlap with distributed optimizer on communication-heavy runs | expected | -| `step_time` | down | TP overlap with `TP >= 2`, sequence parallelism, and supported TE path | expected | -| `pipeline_idle_time` | down | interleaved PP where p2p cost is visible | expected | -| `step_time` | flat | Qwen3-30B-A3B, EP=4, `alltoall`, 2 nodes, GBS=8 | measured: 822ms baseline vs 827ms overlap | -| `step_time` | up | same model/config, GBS=64 | measured: 4889ms baseline vs 5538ms overlap | -| `step_time` | up | Qwen3-Next-80B-A3B, EP=8, `alltoall`, 8 nodes, `delay_wgrad_compute=True` vs overlap-only | measured: 4912ms vs 4686ms | - -Do not assume one overlap win transfers automatically to another mode. The -correct question is always "which communication path is exposed in this run?" - -## Common Failure Modes - -- TP overlap silently disables itself when sequence parallelism is off or `TP < 2`. -- PP overlap expectations are wrong when the schedule is non-interleaved or VPP is missing. -- EP overlap asserts when `PP > 1` but `virtual_pipeline_model_parallel_size` is unset. -- EP overlap asserts when full recompute, recompute method, or shared-expert overlap stays enabled. -- Setting `moe_flex_dispatcher_backend` alone does not activate DeepEP or HybridEP; the dispatcher must actually switch to `flex`. -- Small-EP `alltoall` MoE runs can get slower because scheduling overhead is larger than the communication being hidden. 
- -## Related Docs - -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/cuda-graphs.md](cuda-graphs.md) -- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md) -- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md) -- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md) -- [skills/perf-techniques/moe-comm-overlap/SKILL.md](../skills/perf-techniques/moe-comm-overlap/SKILL.md) -- [skills/perf-techniques/moe-comm-overlap/card.yaml](../skills/perf-techniques/moe-comm-overlap/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/mixed-precision.md -```md -# Mixed Precision Training - -Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. Megatron Bridge supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models through the {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` configuration. - -## Configuration Overview - -Mixed precision is configured in Megatron Bridge through the `mixed_precision` field in {py:class}`bridge.training.config.ConfigContainer`, which accepts either: -- A string name referencing a predefined recipe (e.g., `"bf16_mixed"`) -- A {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` object for custom configurations - -The mixed precision configuration automatically updates the model, optimizer, and distributed data parallel settings with the appropriate precision parameters. - -## Half-Precision Training - -Megatron Bridge supports half-precision FP16 and BF16 computation training via Megatron Core and the distributed optimizer. 
This training recipe uses half-precision in all layer computation while keeping the model states (optimizer states and master parameters) in single-precision. To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer step. - -### Using Predefined Recipes - -The simplest way to enable mixed precision is using predefined recipe names: - -```python -from megatron.bridge.training.config import ConfigContainer - -# Configure with BF16 mixed precision -config = ConfigContainer( - mixed_precision="bf16_mixed", - # ... other config parameters -) - -# Configure with FP16 mixed precision -config = ConfigContainer( - mixed_precision="fp16_mixed", - # ... other config parameters -) -``` - -### Custom Mixed Precision Configuration - -For more control, create a custom {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig`: - -```python -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -import torch - -# Custom BF16 configuration -bf16_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - autocast_enabled=False, - grad_reduce_in_fp32=True, -) - -config = ConfigContainer( - mixed_precision=bf16_config, - # ... other config parameters -) -``` - -## FP8 Training - -NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. Megatron Bridge uses the NVIDIA TransformerEngine (TE) to leverage speedups from FP8. For a more detailed overview, refer to the [TE documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html), specifically the FP8 format and recipe. 
- -### FP8 Configuration Parameters - -The {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` provides several FP8-specific parameters: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `fp8` | `Optional[str]` | `None` | FP8 format: `"hybrid"` (E4M3 for activations/weights, E5M2 for gradients) or `"e4m3"` | -| `fp8_recipe` | `str` | `"tensorwise"` | FP8 recipe type: `"tensorwise"`, `"delayed"`, `"blockwise"`, `"mxfp8"` (Blackwell only) | -| `first_last_layers_bf16` | `bool` | `False` | If True, retains first and last N TransformerBlocks in BF16 as opposed to FP8 | -| `num_layers_at_start_in_bf16` | `int` | `0` | Number of layers at the start of the model to keep in BF16 precision when `first_last_layers_bf16` is True | -| `num_layers_at_end_in_bf16` | `int` | `0` | Number of layers at the end of the model to keep in BF16 precision when `first_last_layers_bf16` is True | -| `fp8_margin` | `int` | `0` | Scaling factor shift by $2^{margin}$ | -| `fp8_amax_history_len` | `int` | `1` | Window size for amax history storage | -| `fp8_amax_compute_algo` | `str` | `"most_recent"` | Amax selection algorithm: `"max"` or `"most_recent"` | -| `fp8_param` | `Optional[bool]` | `None` | Store module-level parameters in FP8 | -| `fp8_param_gather` | `bool` | `False` | Enable FP8 parameter gathering | - -### FP8 Recipe Examples - -Use any of the predefined FP8 recipe names with the `mixed_precision` parameter: - -```python -# Example: BF16 with FP8 current scaling -config = ConfigContainer( - mixed_precision="bf16_with_fp8_current_scaling_mixed", - # ... other config parameters -) -``` - -## Available Mixed Precision Recipes - -Megatron Bridge provides numerous predefined mixed precision recipes for different use cases. You can use the {py:func}`~megatron.bridge.training.mixed_precision.get_mixed_precision_config` utility function to convert from a string shortname to a class instance. 
For the complete list of available recipes and their specific configurations, see the {py:mod}`megatron.bridge.training.mixed_precision` module. - - -### Custom FP8 Configuration - -For advanced use cases, create a custom FP8 configuration: - -```python -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -import torch - -# Custom FP8 configuration -fp8_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - fp8="hybrid", - fp8_recipe="tensorwise", - fp8_margin=0, - fp8_amax_history_len=1024, - fp8_amax_compute_algo="max", - fp8_param_gather=True, -) - -config = ConfigContainer( - mixed_precision=fp8_config, - # ... other config parameters -) -``` - -### Registering Custom Mixed Precision Recipes - -You can also register your own custom mixed precision configurations to work with the shortname system. Use the {py:func}`~megatron.bridge.training.mixed_precision.register` decorator on a function that returns a `MixedPrecisionConfig` object: - -```python -from megatron.bridge.training.mixed_precision import register, MixedPrecisionConfig - -@register -def my_custom_fp8_recipe() -> MixedPrecisionConfig: - """Custom FP8 recipe with specific settings for my use case.""" - return MixedPrecisionConfig( - bf16=True, - fp8="hybrid", - fp8_recipe="tensorwise", - fp8_param_gather=True, - # ... 
other custom settings - ) - -# Now you can use it with the utility function -config = get_mixed_precision_config("my_custom_fp8_recipe") -``` - -Common recipe categories include: -- **Half-precision recipes**: Basic BF16 and FP16 mixed precision -- **FP8 recipes**: Various FP8 scaling strategies (delayed, current, subchannel) -- **Architecture-specific recipes**: Optimized for specific GPU architectures (Hopper, Blackwell) -- **Model-specific recipes**: Tuned for particular model families - -## Configuration Synchronization - -When a mixed precision configuration is provided, it automatically synchronizes precision-related settings across the model, optimizer, and distributed data parallel (DDP) configurations. This ensures consistent precision behavior throughout the training pipeline. - -**Important**: Mixed precision settings will override any conflicting precision parameters that may have been set directly on the model, optimizer, or DDP configurations. The mixed precision configuration acts as the authoritative source for all precision-related parameters. - -For example, if you specify both: -```python -# This will be overridden -model_config.bf16 = False -optimizer_config.bf16 = False - -config = ConfigContainer( - model=model_config, - optimizer=optimizer_config, - mixed_precision="bf16_mixed", # This takes precedence during training - # ... other configs -) -``` - -The mixed precision configuration will set `bf16=True` on both the model and optimizer configs, overriding the explicitly set `False` values. This synchronization prevents configuration mismatches that could lead to training issues. 
- -## Performance Considerations - -- **FP8 recipes are experimental** and convergence has not been fully validated for all models -- **BF16** is generally recommended over FP16 for better numerical stability -- **FP8** provides the best performance on H100 GPUs but requires careful tuning -- **MXFP8** recipes are only supported on Blackwell architecture GPUs -- **Blockwise scaling** recipes are optimized for Hopper architecture GPUs - -## Resources - -- [Transformer Engine Documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html) -- [Intro to FP8, floating point formats, and mixed precision training](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Introduction-to-FP8) -- [Performance optimizations](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/advanced_optimizations.html) that are natively supported in Megatron Bridge by enabling FP8 training with TE - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/activation-recomputation.md -```md -# Activation Recomputation - -The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. When training an LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage. - -Activation recomputation in Megatron Bridge is configured through the model provider's recomputation parameters, which are based on Megatron Core's `TransformerConfig`. - -## Transformer Layer Recomputation - -Megatron Bridge supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. 
However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer's forward computation. - -Megatron Bridge also supports partial transformer layer recomputation, which is beneficial when recomputing a few transformer layers helps to reduce enough GPU memory for the model to fit. This approach avoids the need to recompute the rest of the layers. - -### Configuration - -Transformer layer recomputation is configured through the model provider's recomputation parameters: - -```python -from megatron.bridge.models import GPTModelProvider - -# Full recomputation - recompute all layers -model_config = GPTModelProvider( - recompute_granularity="full", # Enable full layer recomputation - recompute_method="uniform", # Uniform distribution across layers - recompute_num_layers=4, # Number of layers per recomputation block - # ... other model parameters -) -``` - -### Recomputation Methods - -#### Block Method -Recomputes a specific number of transformer layers per pipeline stage: - -```python -model_config = GPTModelProvider( - recompute_granularity="full", - recompute_method="block", # Block-wise recomputation - recompute_num_layers=4, # Recompute 4 layers per pipeline stage -) -``` - -#### Uniform Method -Uniformly divides the total number of transformer layers and recomputes input activations for each divided chunk: - -```python -model_config = GPTModelProvider( - recompute_granularity="full", - recompute_method="uniform", # Uniform distribution - recompute_num_layers=8, # Number of layers per recomputation block -) -``` - -### Pipeline Parallelism Considerations - -When training with pipeline parallelism: -- `recompute_num_layers` indicates the layers per pipeline stage -- When using virtual pipelining, `recompute_num_layers` specifies the number of layers per virtual pipeline stage -- The framework automatically handles recomputation coordination across pipeline stages - -![Activation Recomputation 
Methods](images/activation-recomputation-example-1.jpg) -*Figure 1: Scheme of uniform and block checkpointing method (full checkpointing granularity)* - -## Self-attention Recomputation - -Megatron Bridge supports selective self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations. This cost-efficient method achieves high memory savings with minimal recomputation cost. - -The intermediate layers of the self-attention block account for the majority of the activation memory because the input sizes of softmax, dropout, and QKV dot-product attention layers have memory complexity proportional to the sequence length squared. However, their recomputation cost is relatively smaller than other linear projection layers that scale with the hidden size squared. - -![Activation Recomputation Granularity](images/activation-recomputation-example-2.jpg) -*Figure 2: Scheme of full and selective checkpointing granularity* - -### Configuration - -Self-attention recomputation is enabled using selective granularity: - -```python -from megatron.bridge.models import GPTModelProvider - -model_config = GPTModelProvider( - recompute_granularity="selective", # Enable selective recomputation - recompute_modules=["core_attn"], # Recompute attention modules (default) - # ... 
other model parameters -) -``` - -### Recomputation Modules - -Megatron Bridge supports selective recomputation for various modules: - -```python -model_config = GPTModelProvider( - recompute_granularity="selective", - recompute_modules=[ - "core_attn", # Core attention computation (default) - "mlp", # MLP layers - "layernorm", # Layer normalization - "moe", # Mixture of Experts layers - "moe_act", # MoE activation functions - "shared_experts", # Shared expert layers - "mla_up_proj", # Multi-Latent Attention up projection - ], -) -``` - -### Flash Attention Integration - -Self-attention recomputation is automatically enabled when using Flash Attention through Transformer Engine. Flash Attention inherently provides memory efficiency by recomputing attention scores rather than storing them, making additional explicit recomputation often unnecessary. - -## Advanced Recomputation Configuration - -### Distributed Activation Checkpointing - -For models using model parallelism, you can distribute saved activations across the model parallel group: - -```python -model_config = GPTModelProvider( - recompute_granularity="selective", - distribute_saved_activations=True, # Distribute across model parallel group - # Note: Cannot be used with sequence_parallel=True -) -``` - -### Memory vs Computation Trade-offs - -Different recomputation strategies offer different memory-computation trade-offs: - -- **Selective recomputation**: Provides high memory savings with minimal recomputation cost by targeting memory-intensive operations like attention -- **Full recomputation**: Significantly reduces activation memory usage but increases per-transformer layer computation cost by approximately 30% -- **No recomputation**: Preserves all activations in memory, requiring more GPU memory but no additional computation - -### MoE-Specific Recomputation - -For Mixture of Experts models, specialized recomputation options are available: - -```python -model_config = GPTModelProvider( - # MoE 
configuration - num_moe_experts=8, - expert_model_parallel_size=2, - - # MoE recomputation - recompute_granularity="selective", - recompute_modules=["moe", "moe_act"], # Recompute MoE-specific modules -) -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/megatron-fsdp.md -```md -# Megatron FSDP - -Megatron FSDP is the practical fully sharded data parallel path in Megatron -Bridge today. It shards parameters, gradients, and optimizer state across data -parallel ranks, which can reduce model-state memory substantially compared with -plain Distributed Data Parallel (DDP) or the distributed optimizer path. - -This page is the stable overview for what Megatron FSDP is, when to use it, and -what constraints matter. For operational enablement, code anchors, and -verification commands, see [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md). - -## What It Is - -Megatron FSDP is the Megatron-Core custom FSDP implementation exposed in Bridge -through `use_megatron_fsdp`. - -Compared with other data-parallel strategies: - -| Feature | DDP | Distributed Optimizer | Megatron FSDP | -|---|---|---|---| -| Parameter Storage | Replicated | Replicated | Sharded | -| Optimizer States | Replicated | Sharded | Sharded | -| Gradient Communication | All-reduce | Reduce-scatter | Reduce-scatter | -| Parameter Communication | None | All-gather (after update) | All-gather (on-demand) | -| Memory Efficiency | Baseline | High | Highest | -| Communication Overhead | Low | Medium | Medium-High | - -The practical consequence is that Megatron FSDP is most useful when model-state -memory, rather than activation memory, is the main bottleneck. 
- -## When to Use It - -Megatron FSDP is a good fit when all of the following are true: - -- the model is too large for plain DDP or distributed optimizer -- you want the strongest currently supported FSDP path in Bridge -- you are willing to trade more communication for lower memory -- you can adopt the required FSDP checkpoint format - -Prefer another path when: - -- DDP already fits comfortably and simplicity matters most -- distributed optimizer gives enough memory relief without fully sharding -- you are evaluating PyTorch FSDP2 for production use on this branch - -## Stable Requirements - -Megatron FSDP in Bridge requires: - -- `use_megatron_fsdp` to be enabled -- checkpoint format `fsdp_dtensor` -- standard rank initialization order - -The `fsdp_dtensor` format uses PyTorch DTensor and -`torch.distributed.checkpoint` (DCP) to store sharded parameters and optimizer -state. It is **not interchangeable** with `torch_dist` or `zarr` checkpoints — -you cannot load an `fsdp_dtensor` checkpoint into a non-FSDP run or vice versa. - -`fsdp_dtensor` is compatible with 5D parallelism (TP + PP + DP + CP + EP). -Because DCP stores DTensor placement metadata, checkpoints saved under one -parallelism layout can be loaded under a different layout (e.g., change TP or PP -size between runs) — DCP handles the shard remapping automatically. The one -unsupported combination is `use_tp_pp_dp_mapping=True`, which uses an -alternative rank-initialization order that conflicts with FSDP sharding. - -Important stable constraints: - -- `use_megatron_fsdp` and `use_torch_fsdp2` are mutually exclusive -- `use_tp_pp_dp_mapping` is not supported with Megatron FSDP -- legacy checkpoint formats such as `torch_dist` and `zarr` are not valid for - Megatron FSDP save/load - -When Megatron FSDP is enabled, Bridge also adjusts some settings -automatically, including disabling `average_in_collective` and several -buffer-reuse optimizations that do not match the FSDP path. 
- -## Compatibility and Caveats - -At the configuration level, Megatron FSDP is intended to work with: - -- tensor parallelism -- pipeline parallelism -- context parallelism -- expert parallelism -- BF16 or FP16 mixed precision - -However, not every combination has the same level of in-repo validation or -performance evidence. Treat broad compatibility as code-supported first, not as -fully benchmark-proven for every combination. - -Two practical caveats matter most: - -1. Public recipes may expose `use_megatron_fsdp` while still defaulting to a - non-FSDP checkpoint format. The checkpoint requirement is stable and - mandatory even when recipe ergonomics lag behind. -2. FSDP reduces model-state memory, not activation memory. For long-sequence or - activation-bound workloads, other techniques such as context parallelism, - activation recomputation, or CPU offloading may still be needed. - -## Torch FSDP2 Status - -Megatron Bridge also exposes a PyTorch FSDP2 path via `use_torch_fsdp2`, but -that path should still be treated as experimental on this branch. - -The stable recommendation today is: - -- use Megatron FSDP if you need an FSDP path in Bridge -- do not treat FSDP2 as interchangeable with Megatron FSDP - -## Related Docs - -- [docs/training/checkpointing.md](checkpointing.md) -- [docs/training/cpu-offloading.md](cpu-offloading.md) -- [docs/performance-guide.md](../performance-guide.md) -- [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md) -- [skills/perf-techniques/megatron-fsdp/card.yaml](../skills/perf-techniques/megatron-fsdp/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/hybrid-context-parallel.md -```md -# Hybrid / Hierarchical Context Parallel - -This page covers the stable Bridge-facing meaning of hierarchical context -parallelism, especially the `a2a+p2p` transport path and -`hierarchical_context_parallel_sizes`. 
- -For operational setup, code anchors, and verification commands, see -[skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md). - -## What It Is - -Context parallelism (CP) splits the input sequence across GPUs so each rank -processes a chunk. The GPUs must communicate KV data during attention. There are -several CP communication backends: - -| `cp_comm_type` | Mechanism | Async / Overlap | Constraint | -|---|---|---|---| -| `"p2p"` | Ring-exchange of KV chunks | Yes | None | -| `"all_gather"` | All-gather full KV before attention | No | None | -| `"a2a"` | All-to-all: scatter heads, gather full sequence (Ulysses-style) | N/A | **CP <= num_kv_heads** | -| `"a2a+p2p"` | Hierarchical: a2a within inner group, p2p across outer group | Partial (p2p part) | Requires `hierarchical_context_parallel_sizes` | - -**HCP (`a2a+p2p`)** exists to scale CP beyond the KV head count by combining -a2a (fast, head-parallel) on intra-node links with p2p (async, -sequence-parallel) on inter-node links. - -It is important to separate this from the upstream boolean -`hybrid_context_parallel`, which is a different feature for balancing packed or -variable-length workloads. The two concepts should not be treated as -interchangeable. - -### Why a2a is limited by KV heads - -a2a transposes the parallelism dimension: each rank trades its sequence chunk -for a subset of attention heads. After the all-to-all, every rank has the -**full sequence** but only `heads / CP` heads. This means: - -- `heads / CP` must be a positive integer. -- The bottleneck is KV heads (not Q heads), because in GQA the KV heads are the - indivisible unit. -- If the model has 8 KV heads, pure a2a supports at most CP=8. - -HCP breaks this limit by applying a2a only within a sub-group small enough to -fit within the KV head count. - -## When to Use It - -**Use HCP when ALL of these are true:** - -1. 
You need CP larger than `num_kv_heads / TP` (pure a2a won't fit). -2. You cannot (or don't want to) increase TP to shrink CP. -3. Your cluster has a clear bandwidth hierarchy (e.g., NVLink intra-node >> IB - inter-node). - -**Prefer pure `a2a` when:** - -- You can adjust TP so that `CP <= num_kv_heads / TP`. This is simpler, avoids - the p2p overhead, and often yields the same throughput with better memory - headroom. - -**Prefer pure `p2p` when:** - -- You have very few KV heads or want maximum CP flexibility. -- Your workload can hide the p2p latency behind compute (long sequences help). - -### Decision example - -Model: 8 KV heads. Cluster: 4 nodes x 8 GPUs. Goal: train 128K sequences. - -| Option | TP | CP | `cp_comm_type` | Notes | -|---|---|---|---|---| -| A | 1 | 16 | `a2a+p2p` with `[8,2]` | a2a intra-node (8 GPUs), p2p across 2 node-groups | -| B | 2 | 4 | `a2a` | CP=4 <= 8 KV heads / TP=2. Simpler. Often same throughput. | -| C | 1 | 16 | `p2p` | Works but no a2a bandwidth benefit intra-node | - -In practice, **option B is usually preferred** -- benchmarks showed identical -throughput to option A with more memory headroom. - -HCP (`a2a+p2p`) should be treated as an advanced feature rather than a default recommendation. - -## Stable Bridge Limitation - -The most important Bridge-specific limitation is that hierarchical context -parallelism is currently supported only on the MPU initialization path. 
- -In practice, that means: - -- `dist.use_decentralized_pg=False` is the supported Bridge path -- the decentralized process-group path should not be assumed to materialize HCP - groups - -## Stable Constraints - -The durable constraints are: - -- `hierarchical_context_parallel_sizes` must match - `context_parallel_size` multiplicatively -- the usual CP sequence-length divisibility rules still apply -- Transformer Engine version support matters for `a2a+p2p` - -## Recommendation Level - -Use hierarchical context parallelism in Bridge only when you intentionally want -that transport path and are prepared to validate execution-path details. It is -not yet the kind of feature that should be presented as universally safe across -all Bridge initialization modes. - -## Related Docs - -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/communication-overlap.md](communication-overlap.md) -- [skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md) -- [skills/perf-techniques/hybrid-context-parallel/card.yaml](../skills/perf-techniques/hybrid-context-parallel/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md -```md -# Packed Sequences - -Packed sequences are a fine-tuning technique that reduces padding waste by -concatenating multiple examples into one pack while preserving sequence -boundaries for attention. In Megatron Bridge, this is primarily a supervised -fine-tuning and PEFT optimization rather than a general pretraining feature. - -This page is the stable overview for what packed sequences are, when to use -them, and which constraints are durable. For operational setup, code anchors, -and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md). - -## What It Is - -Fine-tuning datasets often contain examples with highly variable lengths. 
When -those examples are batched conventionally, many tokens in each batch are just -padding. Packed sequences reduce that waste by building longer packs from -multiple examples and carrying boundary metadata into the attention path. - -In Bridge today, there are two distinct packing paths plus long-context -enablement through context parallelism: - -| Path | Use case | Key config | -|---|---|---| -| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` | -| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` | -| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` | - -These are related but they are not the same knob. Offline packed SFT and VLM -in-batch packing solve padding waste; long-context training primarily addresses -activation memory and communication tradeoffs at larger sequence lengths. - -## When to Use It - -Packed sequences are a good fit when all of the following are true: - -- you are doing SFT, PEFT, or VLM finetuning (all three are covered by the two - packing paths; see the path table above) -- your examples have variable lengths and padding waste is significant -- you can tolerate the micro-batch constraints of packed training - -Packed sequences are usually not the right answer when: - -- you are doing standard Megatron-style pretraining, which already concatenates - documents during sampling -- you want long-context training in general, where context parallelism is often - the main technique -- your model family or recipe explicitly opts out of packed-sequence support - -## Stable Constraints - -The durable constraints for packed sequences in Bridge are: - -- offline packed SFT requires `micro_batch_size == 1`, while VLM in-batch - packing requires `micro_batch_size > 1` -- when context parallelism is used, sequence length must satisfy the standard - CP divisibility constraints -- for fine-tuning with CP enabled, per-token loss behavior and reduction - settings matter -- CUDA-graph-friendly packed metadata requires additional padding constraints -
-Model-family support is not universal. Some families and recipe paths explicitly -opt out of packed sequences or related packing modes. - -## Relationship to Long-Sequence Training - -Packed sequences and long-sequence training are often mentioned together because -both affect sequence layout and memory behavior, but they solve different -problems: - -- packed sequences mainly reduce padding waste in fine-tuning datasets -- long-sequence training mainly addresses activation memory and communication - tradeoffs at larger sequence lengths - -For long-sequence training guidance, see: - -- `docs/performance-guide.md` -- `docs/training/hybrid-context-parallel.md` - -## Practical Caveats - -The most stable caveats to remember are: - -1. Packed-sequence support is recipe- and model-family-specific. -2. Fine-tuning sequence packing should not be assumed to work with every other - training feature. -3. Packed sequences improve efficiency primarily by reducing padding waste, not - by replacing long-context parallelism or memory-planning techniques. 
- -## Related Docs - -- [docs/training/multi-token-prediction.md](multi-token-prediction.md) -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md) -- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md) -- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml) -- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md) -- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/parallelism-strategies/SKILL.md -```md ---- -name: parallelism-strategies -description: Operational guide for choosing and combining parallelism strategies in Megatron Bridge, including sizing rules, hardware topology mapping, and combined parallelism configuration. ---- - -# Parallelism Strategy Selection Skill - -For stable background on each parallelism type, see: - -- `docs/parallelisms.md` -- `card.yaml` (co-located) - -## Decision by Model Size - -### Dense models - -| Model size | GPUs | Recommended starting point | -|---|---|---| -| < 1B | 1-8 | DP only | -| 1-10B | 8-16 | TP=2-4 + DP | -| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP | -| 70-175B | 64-256 | TP=8 + PP=4-8 + DP | -| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP | - -### MoE models - -MoE parallelism differs from dense models. Because only a fraction of -parameters are active per token, TP can often stay at 1 or 2 — the active -parameter shard already fits on a single GPU. EP is the primary scaling -dimension, with PP handling cross-node layer distribution. 
- -| Model (total / active) | TP | PP | EP | Notes | -|---|---|---|---|---| -| OLMoE 7B / 1B | 1 | 1 | 8 | EP only, fits single node | -| Moonlight 16B / 3B | 2 | 1 | 8 | small TP for shared layers | -| DeepSeek-V2 236B / 21B | 1 | 4 | 32 | no TP at all | -| GLM-4.5 Air 106B / 12B | 1 | 4 | 8 | no TP at all | -| Qwen3 30B-A3B | 4 | 2 | 4 | | -| GLM-4.5 355B / 32B | 2 | 8 | 16 | | -| Qwen3 235B-A22B | 4 | 16 | 8 | CP=2 for pretrain | -| DeepSeek-V3 671B / 37B | 2 | 16 | 64 | TP=2, not 8 | -| Kimi-K2 1T | 2 | 16 | 32 | | - -Key patterns: - -- TP is sized by **active** params, not total params. A 671B MoE with - 37B active needs far less TP than a 70B dense model. -- EP scales with expert count. Common: EP = num_experts or - num_experts / experts_per_gpu. -- PP handles depth. Large MoE models use PP=8-16 across nodes. -- ETP (expert tensor parallelism) is rarely used. Llama 4 is an - exception (ETP=4). - -These are starting points, not hard rules. Always profile the first -iteration to verify memory and communication. - -## Decision by Hardware Topology - -Single node with NVLink: - -```python -cfg.model.tensor_model_parallel_size = 8 -``` - -Multiple nodes with InfiniBand: - -```python -cfg.model.tensor_model_parallel_size = 8 -cfg.model.pipeline_model_parallel_size = N -``` - -Limited network (Ethernet): - -```python -cfg.model.tensor_model_parallel_size = 4 -cfg.model.pipeline_model_parallel_size = M -``` - -The stable rule is: keep TP within a single NVLink domain. Use PP or DP -for cross-node scaling. TP across nodes is almost always a performance -loss. 
- -## Decision by Sequence Length - -| Sequence length | Recommendation | -|---|---| -| < 2K | standard TP + PP + DP | -| 2K-8K | add SP (`sequence_parallel=True`) | -| 8K-32K | add CP=2 | -| 32K+ | add CP=4-8, consider `a2a+p2p` for large CP | - -## Combined Parallelism Enablement - -3D parallelism (TP + PP + DP): - -```python -cfg.model.tensor_model_parallel_size = 4 -cfg.model.pipeline_model_parallel_size = 4 -cfg.model.sequence_parallel = True -``` - -4D parallelism (TP + PP + CP + DP): - -```python -cfg.model.tensor_model_parallel_size = 8 -cfg.model.pipeline_model_parallel_size = 8 -cfg.model.context_parallel_size = 2 -cfg.model.sequence_parallel = True -``` - -MoE with EP + PP (e.g. DeepSeek-V2 236B on 128 GPUs): - -```python -cfg.model.tensor_model_parallel_size = 1 -cfg.model.pipeline_model_parallel_size = 4 -cfg.model.expert_model_parallel_size = 32 -cfg.model.sequence_parallel = False -``` - -MoE with small TP + PP + EP (e.g. DeepSeek-V3 671B on 256 GPUs): - -```python -cfg.model.tensor_model_parallel_size = 2 -cfg.model.pipeline_model_parallel_size = 16 -cfg.model.expert_model_parallel_size = 64 -cfg.model.sequence_parallel = True -``` - -DP size is always implicit: - -``` -data_parallel_size = world_size / (TP * PP * CP) -``` - -## Memory Estimation - -Without parallelism (70B model, FP16): - -``` -parameters: 140 GB -gradients: 140 GB -optimizer states: 280 GB (Adam) -activations: 48 GB (batch=1, seq=4K) -total: 608 GB -``` - -With TP=4, PP=4, DP=4 (64 GPUs): - -``` -parameters: 8.75 GB per GPU -gradients: 8.75 GB per GPU -optimizer states: 17.50 GB per GPU -activations: 3.00 GB per GPU -total: ~38 GB per GPU -``` - -## Code Anchors - -Parallelism dimensions set in model provider: - -```66:81:docs/parallelisms.md -model_config = GPTModelProvider( - tensor_model_parallel_size=2, - # ... 
other model parameters -) -``` - -DP size calculation: - -```424:436:docs/parallelisms.md -data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size) -``` - -Bridge initialization wires parallelism into process groups: - -```618:628:src/megatron/bridge/training/initialize.py -parallel_state.initialize_model_parallel( - tensor_model_parallel_size=model_config.tensor_model_parallel_size, - pipeline_model_parallel_size=model_config.pipeline_model_parallel_size, - ... - context_parallel_size=model_config.context_parallel_size, - hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes, - expert_model_parallel_size=model_config.expert_model_parallel_size, - ... -) -``` - -## Pitfalls - -1. TP across nodes destroys throughput. Always keep TP within a single - NVLink domain. - -2. PP without interleaving has large pipeline bubbles. Use - `virtual_pipeline_model_parallel_size` when possible. - -3. SP requires `tensor_model_parallel_size > 1`. Enabling SP alone - without TP is a config error. - -4. CP requires `seq_length % (2 * context_parallel_size) == 0`. - -5. EP is only for MoE models. Setting `expert_model_parallel_size` on a - dense model is a no-op or error. - -6. The model-size-to-parallelism table above is a starting heuristic. - Always profile the first iteration to check memory and communication. - -7. `CUDA_DEVICE_MAX_CONNECTIONS` and related env vars interact with - overlap settings. See `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md`. 
- -## Verification - -Quick sanity check that combined parallelism initializes correctly using -the smallest available recipe with overridden parallelism: - -```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 uv run python -m torch.distributed.run --nproc_per_node=4 \ - scripts/training/run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - model.tensor_model_parallel_size=2 \ - model.pipeline_model_parallel_size=2 \ - model.sequence_parallel=True \ - train.train_iters=3 train.global_batch_size=8 train.micro_batch_size=1 \ - scheduler.lr_warmup_iters=0 \ - validation.eval_iters=0 validation.eval_interval=0 \ - checkpoint.save_interval=0 \ - logger.log_interval=1 -``` - -Success criteria: - -- exit code 0 -- finite loss at iteration 3 (e.g. `lm loss: 1.003808E+01`) -- log shows TP=2 PP=2 DP=1 layout with 4 ranks - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/sequence-packing/SKILL.md -```md ---- -name: sequence-packing -description: Operational guide for enabling packed sequences and long-context config paths in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification. 
---- - -# Sequence Packing Skill - -For stable background and recommendation level, see: - -- `docs/training/packed-sequences.md` -- `card.yaml` (co-located) - -## Enablement - -Offline packed SFT for LLM finetuning: - -```python -from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs - -cfg.train.micro_batch_size = 1 -cfg.dataset.seq_length = 4096 -cfg.model.seq_length = 4096 -cfg.dataset.dataset_kwargs = {"pad_to_max_length": True} -cfg.dataset.packed_sequence_specs = PackedSequenceSpecs( - packed_sequence_size=4096, - pad_seq_to_mult=1, -) -``` - -If CP is enabled: - -```python -cfg.model.context_parallel_size = 2 -cfg.model.calculate_per_token_loss = True -cfg.ddp.average_in_collective = False -cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2 -``` - -If CUDA graphs are enabled for this packed path: - -```python -cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True -cfg.dataset.dataset_kwargs["pad_to_max_length"] = True -``` - -**Note:** `pad_cu_seqlens = True` also requires a metadata JSON file alongside -the packed dataset (asserted in `src/megatron/bridge/data/datasets/sft.py`). -Custom packed datasets that omit the metadata file will hit an assertion at -dataset initialization. 
- -In-batch packing for VLM finetuning: - -```python -cfg.dataset.pack_sequences_in_batch = True -cfg.train.micro_batch_size = 2 -``` - -Long-context baseline: - -```python -cfg.model.seq_length = 16384 -cfg.dataset.seq_length = 16384 -cfg.model.context_parallel_size = 2 -``` - -## Code Anchors - -LLM packed SFT config surface: - -```72:97:src/megatron/bridge/recipes/utils/finetune_utils.py -if packed_sequence: - dataset_kwargs = {"pad_to_max_length": True} - packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult) -else: - dataset_kwargs = {} - packed_sequence_specs = None -``` - -Bridge validation: - -```1617:1657:src/megatron/bridge/training/config.py -if self.model.context_parallel_size > 1: - assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, ... - if isinstance(self.dataset, FinetuningDatasetConfig): - assert self.model.calculate_per_token_loss, ... - assert not self.ddp.average_in_collective, ... -... -if ... packed_sequence_size > 0 and self.train.micro_batch_size > 1: - raise ValueError(...) -... -if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1: - raise ValueError(...) -``` - -VLM in-batch runtime: - -```308:327:src/megatron/bridge/training/vlm_step.py -if enable_packing: - ... - ) = pack_batch_sequences( - ... - pad_token_id=0, - pad_to_multiple_of=cp_size * 2 if cp_size > 1 else 1, - ) -``` - -Packed THD runtime constraint: - -```61:64:src/megatron/bridge/training/gpt_step.py -if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1: - raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)") -``` - -## Pitfalls - -1. Offline packed SFT and VLM in-batch packing are different features with opposite micro-batch rules. -2. When CP is enabled, packed sequence lengths must respect `2 * context_parallel_size` divisibility. -3. 
For finetuning with CP, `calculate_per_token_loss=True` and `ddp.average_in_collective=False` are required. -4. `pad_cu_seqlens=True` also requires `pad_to_max_length=True`. -5. Packing support is model-family-specific. `Qwen3-Next`, `GLM-4.5`, and `Qwen3.5-VL` contain explicit opt-outs in different paths. -6. MTP finetuning is documented as incompatible with packed sequences. - -## Verification - -Use the checked-in unit coverage: - -```bash -uv run python -m pytest tests/unit_tests/training/utils/test_packed_seq_utils.py -v && \ -uv run python -m pytest tests/unit_tests/training/test_config.py -k "packed_sequence or pack_sequences_in_batch or context_parallel_seq_length_divisibility or context_parallel_finetuning_validations" -v && \ -uv run python -m pytest tests/unit_tests/training/test_vlm_step.py -k "enable_packing" -v -``` - -Success criteria: - -- first command reports `8 passed` -- second command reports `14 passed` -- third command reports `2 passed` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/tp-dp-comm-overlap/SKILL.md -```md ---- -name: tp-dp-comm-overlap -description: Operational guide for enabling TP, DP, and PP communication overlap in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification. 
---- - -# TP / DP / PP Communication Overlap Skill - -For stable background and recommendation level, see: - -- `docs/training/communication-overlap.md` - -## Enablement - -Minimal Bridge override: - -```python -from megatron.bridge.training.comm_overlap import CommOverlapConfig - -cfg.model.tensor_model_parallel_size = 4 -cfg.model.sequence_parallel = True -cfg.model.pipeline_model_parallel_size = 4 -cfg.model.virtual_pipeline_model_parallel_size = 2 - -cfg.comm_overlap = CommOverlapConfig( - tp_comm_overlap=True, -) - -cfg.ddp.use_distributed_optimizer = True -cfg.ddp.overlap_grad_reduce = True -cfg.ddp.overlap_param_gather = True -``` - -Optional TP preset: - -```python -from megatron.bridge.training.comm_overlap import userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 - -cfg.comm_overlap.tp_comm_overlap_cfg = userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 -``` - -Precision knobs belong to mixed precision: - -```python -cfg.mixed_precision.grad_reduce_in_fp32 = False -cfg.mixed_precision.fp8_param_gather = False -``` - -## Code Anchors - -Bridge overlap gating: - -```439:449:src/megatron/bridge/training/comm_overlap.py -if self.user_comm_overlap_cfg.tp_comm_overlap is True: - if model_cfg.tensor_model_parallel_size < 2: - ... - elif not model_cfg.sequence_parallel: - ... - elif not HAVE_TE: - ... 
-``` - -PP overlap selection: - -```451:458:src/megatron/bridge/training/comm_overlap.py -if model_cfg.pipeline_model_parallel_size > 1: - if vp_size > 1: - comm_overlap_cfg.overlap_p2p_comm = True - comm_overlap_cfg.batch_p2p_comm = False - else: - comm_overlap_cfg.overlap_p2p_comm = False - comm_overlap_cfg.batch_p2p_comm = True -``` - -DP overlap defaults: - -```572:579:src/megatron/bridge/training/comm_overlap.py -if self.data_parallel_size > 1: - comm_overlap_cfg.bucket_size = 128 * 1024 * 1024 - comm_overlap_cfg.overlap_grad_reduce = True - comm_overlap_cfg.overlap_param_gather = True -``` - -Launch-time env tuning: - -```570:609:src/megatron/bridge/recipes/run_plugins.py -executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(cuda_device_max_connections) -... -executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) -executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin) -``` - -## Pitfalls - -1. TP overlap silently disables itself if `sequence_parallel=False` or Transformer Engine is unavailable. -2. PP overlap is not enabled for all PP cases. Bridge only auto-selects `overlap_p2p_comm=True` when `PP > 1` and `VPP > 1`. -3. `bucket_size` is a parameter-count knob, not a byte-size knob. -4. `grad_reduce_in_fp32` and `fp8_param_gather` should be set through mixed precision, not as standalone DDP tuning first. -5. `CUDA_DEVICE_MAX_CONNECTIONS` and LayerNorm SM margin are launch-time plugin settings, not `CommOverlapConfig` fields. 
- -## Verification - -Use the checked-in overlap unit coverage first: - -```bash -uv run python -m pytest tests/unit_tests/training/test_comm_overlap.py -q -``` - -Optional second check if `nemo_run` is available: - -```bash -uv run python -m pytest tests/unit_tests/recipes/test_run_plugins.py -q -``` - -Success criteria: - -- first command reports `26 passed` -- second command validates plugin-owned env wiring when not skipped - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/cuda-graphs/SKILL.md -```md ---- -name: cuda-graphs -description: Validate and use CUDA graph capture in Megatron Bridge, including local full-iteration graphs and Transformer Engine scoped graphs for attention, MLP, and MoE modules. ---- - -# CUDA Graphs - -Stable docs: `docs/training/cuda-graphs.md` -Card: `card.yaml` (co-located) - -## What It Is - -CUDA graphs capture GPU operations once and replay them with minimal -host-driver overhead. Bridge supports two implementations: - -| `cuda_graph_impl` | Mechanism | Scope support | -|---|---|---| -| `"local"` | MCore `FullCudaGraphWrapper` wrapping entire fwd+bwd | `full_iteration` | -| `"transformer_engine"` | TE `make_graphed_callables()` per layer | `attn`, `mlp`, `moe`, `moe_router`, `moe_preprocess`, `mamba` | - -## Enablement - -### Local full-iteration graph - -```python -cfg.model.cuda_graph_impl = "local" -cfg.model.cuda_graph_scope = ["full_iteration"] -cfg.model.cuda_graph_warmup_steps = 3 -cfg.model.use_te_rng_tracker = True -cfg.rng.te_rng_tracker = True -cfg.rerun_state_machine.check_for_nan_in_loss = False -cfg.ddp.check_for_nan_in_grad = False -``` - -### TE scoped graph (dense model) - -```python -cfg.model.cuda_graph_impl = "transformer_engine" -cfg.model.cuda_graph_scope = ["attn"] # or ["attn", "mlp"] -cfg.model.cuda_graph_warmup_steps = 3 -cfg.model.use_te_rng_tracker = True -cfg.rng.te_rng_tracker = True -``` - -### TE scoped graph (MoE model) - -```python -cfg.model.cuda_graph_impl = 
"transformer_engine" -cfg.model.cuda_graph_scope = ["attn", "moe_router", "moe_preprocess"] -cfg.model.cuda_graph_warmup_steps = 3 -cfg.model.use_te_rng_tracker = True -cfg.rng.te_rng_tracker = True -``` - -### Performance harness CLI - -```bash -python scripts/performance/run_performance_workload.py \ - --cuda_graph_impl transformer_engine \ - --cuda_graph_scope attn moe_router moe_preprocess \ - ... -``` - -Valid CLI values live in `scripts/performance/argument_parser.py`: -- `VALID_CUDA_GRAPH_IMPLS`: `["none", "local", "transformer_engine"]` -- `VALID_CUDA_GRAPH_SCOPES`: `["full_iteration", "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba"]` - -### Required constraints - -- `use_te_rng_tracker = True` (enforced in `gpt_provider.py`) -- `full_iteration` scope only with `cuda_graph_impl = "local"` -- `full_iteration` scope requires `check_for_nan_in_loss = False` -- Do not combine `moe` scope and `moe_router` scope -- Tensor shapes must be static (fixed seq_length, fixed micro_batch_size) -- MoE token-dropless routing limits graphable scope to dense modules -- With `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`, set - `NCCL_GRAPH_REGISTER=0` (MCore enforces for local impl on arch < sm_100; - TE impl asserts unconditionally) -- CPU offloading is incompatible with CUDA graphs -- `moe_preprocess` scope requires `moe_router` scope to also be set - -## Code Anchors - -### Bridge config and validation - -```1524:1531:src/megatron/bridge/training/config.py - # CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph - if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope: - assert not self.rerun_state_machine.check_for_nan_in_loss, ( - "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. " - "Set rerun_state_machine.check_for_nan_in_loss=False." 
- ) - if self.model.cuda_graph_impl == "none": - self.model.cuda_graph_scope = [] -``` - -### TE RNG tracker requirement - -```213:216:src/megatron/bridge/models/gpt_provider.py - if self.cuda_graph_impl != "none": - assert getattr(self, "use_te_rng_tracker", False), ( - "Transformer engine's RNG tracker is required for cudagraphs, it can be " - "enabled with use_te_rng_tracker=True'." -``` - -### Graph creation and capture in training loop - -```231:255:src/megatron/bridge/training/train.py - # Capture CUDA Graphs. - cuda_graph_helper = None - if model_config.cuda_graph_impl == "transformer_engine": - cuda_graph_helper = TECudaGraphHelper(...) - # ... - if config.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in config.model.cuda_graph_scope: - forward_backward_func = FullCudaGraphWrapper( - forward_backward_func, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps - ) -``` - -### TE graph capture after warmup - -```338:350:src/megatron/bridge/training/train.py - # Capture CUDA Graphs after warmup. 
- if ( - model_config.cuda_graph_impl == "transformer_engine" - and cuda_graph_helper is not None - and not cuda_graph_helper.graphs_created() - and global_state.train_state.step - start_iteration == model_config.cuda_graph_warmup_steps - ): - if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook: - disable_forward_pre_hook(model, param_sync=False) - cuda_graph_helper.create_cudagraphs() - if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook: - enable_forward_pre_hook(model) - cuda_graph_helper.cuda_graph_set_manual_hooks() -``` - -### RNG initialization - -```199:206:src/megatron/bridge/training/initialize.py - _set_random_seed( - rng_config.seed, - rng_config.data_parallel_random_init, - rng_config.te_rng_tracker, - rng_config.inference_rng_tracker, - use_cudagraphable_rng=(model_config.cuda_graph_impl != "none"), - pg_collection=pg_collection, - ) -``` - -### Delayed wgrad + CUDA graph interaction - -```522:555:src/megatron/bridge/training/comm_overlap.py - cuda_graph_scope = getattr(model_cfg, "cuda_graph_scope", []) or [] - # ... scope parsing ... - if wgrad_in_graph_scope: - assert is_te_min_version("2.12.0"), ... - assert model_cfg.gradient_accumulation_fusion, ... - if attn_scope_enabled: - assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, ... 
-``` - -### Perf harness override helper - -```102:124:scripts/performance/utils/overrides.py -def _set_cuda_graph_overrides( - recipe, cuda_graph_impl=None, cuda_graph_scope=None -): - # Sets impl, scope, and auto-enables te_rng_tracker -``` - -### Graph cleanup - -```1414:1441:src/megatron/bridge/training/train.py -def _delete_cuda_graphs(cuda_graph_helper): - # Deletes FullCudaGraphWrapper and TE graph objects to free NCCL buffers -``` - -### MCore classes (in 3rdparty/Megatron-LM) - -- `CudaGraphManager`: `megatron/core/transformer/cuda_graphs.py` -- `TECudaGraphHelper`: `megatron/core/transformer/cuda_graphs.py` -- `FullCudaGraphWrapper`: `megatron/core/full_cuda_graph.py` -- `CudaGraphScope` enum: `megatron/core/transformer/enums.py` - -### Positive recipe anchors - -- `scripts/performance/configs/deepseek/deepseek_workload_base_configs.py` -- `scripts/performance/configs/qwen/qwen3_workload_base_configs.py` -- `scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py` - -### Tests - -| File | Coverage | -|---|---| -| `tests/unit_tests/training/test_config.py` | `full_iteration` NaN-check constraint | -| `tests/unit_tests/training/test_comm_overlap.py` | `delay_wgrad` + CUDA graph interaction | -| `tests/unit_tests/models/test_gpt_full_te_layer_autocast_spec.py` | TE autocast with CUDA graphs | -| `tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py` | End-to-end local and TE graph smoke tests | -| `tests/unit_tests/recipes/kimi/test_kimi_k2.py` | TE + CUDA graph recipe config | -| `tests/unit_tests/recipes/gpt/test_gpt3_175b.py` | TE + CUDA graph recipe config | -| `tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py` | VLM CUDA graph settings | - -## Pitfalls - -1. **TE RNG tracker is mandatory**: Setting `cuda_graph_impl` without - `use_te_rng_tracker=True` and `rng.te_rng_tracker=True` will assert - in the provider. - -2. 
**`full_iteration` requires NaN checks disabled**: The entire fwd+bwd is - captured, so loss-NaN checking cannot inspect intermediate values. - -3. **MoE scope restrictions**: `moe` scope and `moe_router` scope are - mutually exclusive. Token-dropless MoE can only graph `moe_router` and - `moe_preprocess`, not the full expert dispatch. - -4. **Memory overhead**: CUDA graphs pin all intermediate buffers for the - graph's lifetime (no memory reuse). TE scoped graphs add a few GB; - full-iteration graphs can increase peak memory by 1.5–2×. `PP > 1` - compounds overhead since each stage holds its own graph. - -5. **Delayed wgrad interaction**: When `delay_wgrad_compute=True` and - attention or MoE router is in `cuda_graph_scope`, additional constraints - apply: TE >= 2.12.0, `gradient_accumulation_fusion=True`, and no - attention bias. - -6. **Variable-length sequences break graphs**: Sequence lengths must be - constant across steps. Use padded packed sequences if packing is needed. - -7. **Graph cleanup is required**: CUDA graph objects hold NCCL buffer - references. Bridge handles this in `_delete_cuda_graphs()` at the end - of training, but early exits must call it explicitly. - -8. **Older GPU architectures**: On GPUs with compute capability < 10.0 - (pre-Blackwell), set `NCCL_GRAPH_REGISTER=0` when using - `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. Enforced in MCore - `CudaGraphManager` (cuda_graphs.py:1428) and `TECudaGraphHelper` - (cuda_graphs.py:1697). The TE impl asserts unconditionally regardless - of arch. - -9. **CPU offloading incompatible**: CUDA graphs cannot be used with CPU - offloading. Enforced in MCore `transformer_config.py:1907`. - -10. **MoE recompute + moe_router scope**: MoE recompute is not supported - with `moe_router` CUDA graph scope when using `cuda_graph_impl = - "transformer_engine"`. Enforced in MCore `transformer_config.py:1977`. 
- -## Verification - -### Unit tests - -```bash -uv run python -m pytest \ - tests/unit_tests/training/test_config.py -k "cuda_graph" \ - tests/unit_tests/training/test_comm_overlap.py -k "cuda_graph" \ - tests/unit_tests/models/test_gpt_full_te_layer_autocast_spec.py -k "cuda_graph" -q -``` - -### Functional smoke test (requires GPU) - -```bash -uv run python -m pytest \ - tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py -q -``` - -### Success criteria - -- Unit tests pass, covering config validation for both `local` and - `transformer_engine` implementations. -- Functional test completes training steps with both CUDA graph - implementations. -- No NCCL errors or illegal memory access in logs. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/config.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import os -import warnings -from abc import ABC, abstractmethod -from dataclasses import MISSING, dataclass, field, fields -from pathlib import Path -from typing import Any, Dict, Literal, Optional, Tuple, Union - -import torch -from megatron.core.datasets.gpt_dataset import GPTDatasetConfig as MCoreGPTDatasetConfig -from megatron.core.distributed import DistributedDataParallelConfig as MCoreDistributedDataParallelConfig -from megatron.core.optimizer import OptimizerConfig as MCoreOptimizerConfig -from megatron.core.optimizer import ( - ParamGroupOverride, - ParamKey, -) -from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.enums import AttnBackend, CudaGraphScope -from megatron.core.transformer.module import MegatronModule -from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig -from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig -from megatron.training.config import CheckpointConfig as MTrainCheckpointConfig -from megatron.training.config import DistributedInitConfig as MTrainDistributedInitConfig -from megatron.training.config import LoggerConfig as MTrainLoggerConfig -from megatron.training.config import ProfilingConfig as MTrainProfilingConfig -from megatron.training.config import RerunStateMachineConfig as MTrainRerunStateMachineConfig -from megatron.training.config import RNGConfig, ValidationConfig -from megatron.training.config import SchedulerConfig as MTrainSchedulerConfig -from megatron.training.config import StragglerDetectionConfig as MTrainStragglerDetectionConfig -from megatron.training.config import TrainingConfig as MTrainTrainingConfig - -from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs -from megatron.bridge.models import GPTModelProvider, T5ModelProvider -from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig -from 
megatron.bridge.models.mamba.mamba_builder import MambaModelConfig -from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider -from megatron.bridge.models.mimo.mimo_provider import MimoModelProvider -from megatron.bridge.peft.base import PEFT -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.flex_dispatcher_backend import validate_flex_dispatcher_backend -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config -from megatron.bridge.training.tokenizers.config import TokenizerConfig -from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer -from megatron.bridge.training.utils.config_utils import _ConfigContainerBase as Container -from megatron.bridge.utils.common_utils import ( - get_world_size_safe, - print_rank_0, - warn_rank_0, -) - - -@dataclass -class DistributedDataParallelConfig(MCoreDistributedDataParallelConfig): - """Megatron Core DistributedDataParallelConfig with deferred post-init. - - This class inherits from Megatron Core's DistributedDataParallelConfig but defers the - execution of post_init() until finalize() is explicitly called. This allows - for field modifications after construction but before computed fields are calculated. - """ - - param_name_patterns_for_fp32_local_accumulation: Tuple[str, ...] = () - """fnmatch patterns selecting parameters whose gradients should be locally - accumulated in FP32. The special pattern ``'all'`` matches every parameter. - Synced from MCore c586f6d56 (#4028); field will be inherited from the base - class after the next mcore bump.""" - - def __post_init__(self) -> None: - """Skip MCore post_init during initial construction. - - The original post_init logic is deferred until finalize() is called. - """ - pass - - def finalize(self) -> None: - """Execute the deferred MCore post-init logic. 
- - This method calls the original Megatron Core DistributedDataParallelConfig.__post_init__() - to compute derived fields based on the current field values. - """ - super().__post_init__() - - -@dataclass -class OptimizerConfig(MCoreOptimizerConfig): - """Megatron Core OptimizerConfig with deferred post-init. - - This class inherits from Megatron Core's OptimizerConfig but defers the - execution of post_init() until finalize() is explicitly called. This allows - for field modifications after construction but before computed fields are calculated. - """ - - def __post_init__(self) -> None: - """Skip MCore post_init during initial construction. - - The original post_init logic is deferred until finalize() is called. - """ - pass - - def finalize(self) -> None: - """Execute the deferred MCore post-init logic. - - This method calls the original Megatron Core OptimizerConfig.__post_init__() - to compute derived fields based on the current field values. - """ - super().__post_init__() - - -@dataclass(kw_only=True) -class DistributedInitConfig(MTrainDistributedInitConfig): - """Configuration settings for distributed training initialization.""" - - external_gpu_device_mapping: bool = False - """If True, indicates that GPU device mapping has been externally managed - (e.g., via CUDA_VISIBLE_DEVICES environment variable). When True, uses device 0 - instead of local rank for CUDA device selection. This is useful when launching - with external process managers that handle GPU visibility. - """ - - enable_megatron_core_experimental: bool = False - """Enable experimental features for Megatron Core.""" - - use_decentralized_pg: bool = False - """Use ProcessGroupCollection passed through functions instead of relying on mcore's - global parallel state (mpu) variables. 
When True, parallel groups are obtained from - the pg_collection object rather than the global megatron.core.parallel_state module.""" - - @property - def lazy_init(self) -> bool: - return self.lazy_mpu_init - - @lazy_init.setter - def lazy_init(self, value: bool) -> None: - self.lazy_mpu_init = value - - -@dataclass(kw_only=True) -class RerunStateMachineConfig(MTrainRerunStateMachineConfig): - """Configuration for the rerun state machine used for result validation or stats.""" - - rerun_mode: Literal["disabled", "validate_results", "report_determinism_stats"] = "disabled" - """Use re-run engine to validate results (default) or to emit stats - on variability of computations due to non-deterministic algorithms.""" - - spiky_loss_factor: float = 10.0 - """Factor for detecting spiky loss. A loss is considered spiky if it exceeds - this multiple of the max observed loss over the sample window.""" - - -@dataclass(kw_only=True) -class DataloaderConfig: - """Base configuration for data loading.""" - - dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = None - """Dataloader type: 'single' for single pass, 'cyclic' for multiple passes with shuffling, - 'batch' for global batch sampling (used in fine-tuning), or 'external' for custom dataloaders.""" - - num_workers: int = 2 - """Dataloader number of workers.""" - - data_sharding: bool = True - """Disable data sharding.""" - - pin_memory: bool = True - """Whether to pin memory during data loading for faster GPU training.""" - - drop_last: bool = True - """Whether to drop the last incomplete batch.""" - - persistent_workers: bool = True - """Whether to keep data loading workers persistent across epochs. 
- Automatically set to False when num_workers is 0.""" - - trust_remote_code: Optional[bool] = None - """Whether remote code execution should be trusted for a given HF path.""" - - def finalize(self): - """Finalize dataloader config field constraints.""" - if self.num_workers == 0 and self.persistent_workers: - self.persistent_workers = False - - -@dataclass(frozen=True) -class DatasetBuildContext: - """Interface that encapsulates framework internals. - - This context provides metadata needed to build datasets - while hiding implementation details of the framework. - - Attributes: - train_samples: Number of samples for training dataset - valid_samples: Number of samples for validation dataset - test_samples: Number of samples for test dataset - tokenizer: Optional tokenizer instance for text processing - pg_collection: Optional process group collection for distributed training - """ - - train_samples: int - valid_samples: int - test_samples: int - tokenizer: Optional[MegatronTokenizer] = None - pg_collection: Optional[ProcessGroupCollection] = None - - -@dataclass(frozen=True) -class OptimizerConfigOverrideProviderContext: - """Context for providing config overrides.""" - - scheduler_config: "SchedulerConfig" - optimizer_config: OptimizerConfig - model: Union[MegatronModule, list[MegatronModule]] - - -@dataclass -class OptimizerConfigOverrideProvider: - """Abstract base class for providing config overrides.""" - - def build_config_overrides( - self, context: OptimizerConfigOverrideProviderContext - ) -> dict[ParamKey, ParamGroupOverride] | None: - """Build config overrides for weight decay based on scheduler configuration. - - This function creates parameter-specific overrides for weight decay behavior. - By default, weight decay is skipped for bias parameters and 1D parameters. - For Qwen3-Next models, weight decay is applied to q_layernorm and k_layernorm. 
- - Args: - context: OptimizerConfigOverrideProviderContext which packages the scheduler - configuration, optimizer configuration, and model. - - Returns: - Dictionary of ParamKey to ParamGroupOverride for the optimizer - """ - model = context.model - scheduler_config = context.scheduler_config - optimizer_config = context.optimizer_config - - config_overrides: dict[ParamKey, ParamGroupOverride] = {} - - # Collect param names that should skip weight decay - # NOTE: this can be simplified once https://github.com/NVIDIA/Megatron-LM/pull/2753 - # is merged into dev. Then we can re-use megatron's apply_wd_to_qk_layernorm option - # and call megatron.core.optimizer.get_standard_config_overrides(optimizer_config) - # directly for standard settings, replacing the custom logic below for qwen3-next. - no_wd_names: list[str] = [] - is_qwen3_next = scheduler_config.no_weight_decay_cond_type == "qwen3_next" - - model_list = model if isinstance(model, list) else [model] - for model_chunk in model_list: - for name, param in model_chunk.named_parameters(): - # Skip weight decay for bias parameters - if name.endswith(".bias"): - no_wd_names.append(name) - continue - - # Skip weight decay for 1D parameters - if len(param.shape) == 1: - if is_qwen3_next: - # Qwen3-Next: apply weight decay to qk layernorm (don't add to skip list) - if "q_layernorm" in name or "k_layernorm" in name: - continue - no_wd_names.append(name) - - # Create a single ParamKey with all names that should skip weight decay - if no_wd_names: - no_wd_key = ParamKey(name=tuple(no_wd_names)) - config_overrides[no_wd_key] = ParamGroupOverride(wd_mult=0.0) - - # Now handle decoupled LR: - if optimizer_config.decoupled_lr is not None: - decoupled_lr_config: ParamGroupOverride = {"max_lr": optimizer_config.decoupled_lr} - decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter") - if optimizer_config.decoupled_min_lr is not None: - decoupled_lr_config["min_lr"] = optimizer_config.decoupled_min_lr - 
config_overrides[decoupled_param_key] = decoupled_lr_config - - return config_overrides if config_overrides else None - - -@dataclass -class DatasetProvider(DataloaderConfig, ABC): - """Abstract base class for custom dataset configurations. - - Provides an interface for users to implement their own dataset builders - while automatically inheriting all DataloaderConfig functionality. - - Users must: - 1. Inherit from this class - 2. Implement the build_datasets() method - - Example: - @dataclass - class S3DatasetConfig(DatasetProvider): - bucket_name: str - data_prefix: str - seq_length: int - - def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]: - # Custom implementation to load data from S3 - train_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/train", context.tokenizer) - valid_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/valid", context.tokenizer) - test_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/test", context.tokenizer) - return train_ds, valid_ds, test_ds - """ - - @abstractmethod - def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]: - """Build train, validation, and test datasets. - - This method is called by the framework during dataset initialization. - Implementations should use the provided context to create appropriate - datasets for each split. - - Args: - context: Build context with sample counts and tokenizer - - Returns: - Tuple of (train_dataset, valid_dataset, test_dataset) - Any element can be None if that split shouldn't be created. - - Raises: - NotImplementedError: Must be implemented by subclasses - """ - pass - - -@dataclass -class GPTDatasetConfig(MCoreGPTDatasetConfig, DataloaderConfig): - """Megatron Core GPTDatasetConfig with deferred post-init. 
- - This class inherits from MCore's GPTDatasetConfig and DataloaderConfig but defers the - execution of post_init() until finalize() is explicitly called. This allows - for field modifications after construction but before computed fields are calculated. - """ - - data_path: str | list[str] | None = None - """CLI-friendly alternative to ``blend``. Accepts a single path string, - a space-separated multi-path string, or a list of paths (with optional - interleaved weights, matching Megatron-LM ``--data-path`` semantics). - Converted to ``blend`` automatically during ``finalize()``.""" - - def __init__( - self, - seq_length: int | None = None, - skip_getting_attention_mask_from_dataset: bool = True, - data_path: str | list[str] | None = None, - *args, - **kwargs, - ): - """ - Args: - seq_length (int | None): the sequence length. If not provided, `sequence_length` must be in kwargs. - skip_getting_attention_mask_from_dataset (bool): if set, the dataset will pass a None attention mask - and the attention mask is autogenerated from the attn backend. - data_path: CLI-friendly data path(s). Converted to ``blend`` in ``finalize()``. - """ - self.skip_getting_attention_mask_from_dataset = skip_getting_attention_mask_from_dataset - self.data_path = data_path - - if seq_length is not None: - kwargs["sequence_length"] = seq_length - elif "sequence_length" not in kwargs: - raise ValueError("Either `seq_length` or `sequence_length` must be provided.") - - dataloader_kwargs = {k: kwargs.pop(k) for k in list(kwargs) if k in DataloaderConfig.__dataclass_fields__} - MCoreGPTDatasetConfig.__init__(self, *args, **kwargs) - DataloaderConfig.__init__(self, **dataloader_kwargs) - - def __post_init__(self) -> None: - """Skip MCore post_init during initial construction. - - The original post_init logic is deferred until finalize() is called. 
- """ - pass - - @property - def seq_length(self): - """Alias for MCore's `sequence_length` field.""" - return getattr(self, "sequence_length", None) - - @seq_length.setter - def seq_length(self, value): - setattr(self, "sequence_length", value) - - def finalize(self) -> None: - """Execute the deferred MCore post-init logic and Bridge-specific checks. - - This method calls the original Megatron Core GPTDatasetConfig.__post_init__() - and then performs Bridge-specific validation. - """ - if self.blend is None and self.data_path is not None: - from megatron.core.datasets.utils import get_blend_from_list - - if isinstance(self.data_path, str): - paths = self.data_path.split() - else: - paths = list(self.data_path) - self.blend = get_blend_from_list(paths) - - # Call MCore's post_init - super(MCoreGPTDatasetConfig, self).__post_init__() - - assert self.reset_position_ids is not None, "reset_position_ids must be defined." - assert self.reset_attention_mask is not None, "reset_attention_mask must be defined." - assert self.eod_mask_loss is not None, "eod_mask_loss must be defined." - - DataloaderConfig.finalize(self) - - -@dataclass -class GPTFIMDatasetConfig(GPTDatasetConfig): - """Configuration object forGPT FIM datasets""" - - def __init__( - self, - fim_rate: float = None, - fim_spm_rate: float = None, - fim_extra_tokens: Dict = None, - fim_split_sample: Optional[str] = None, - fim_fragment_rate: Optional[float] = None, - fim_no_prefix: Optional[str] = None, - **kwargs, - ): - """ - Args: - fim_rate: float: probability to convert a training sample into a FIM format. - fim_spm_rate (float): probability that the a FIM sample uses the SPM format over the PSM format. - fim_extra_tokens (Dict): should consist of prefix, middle, suffix, PAD, and EOD tokens. - fim_split_sample (str): string around which to split the sample for FIM. - fim_fragment_rate (float): rate of FIM on each fragment when split_sample is not None. 
- fim_no_prefix (str): do not apply FIM to fragments that start with this prefix. - """ - self.fim_data = True - self.fim_rate = fim_rate - self.fim_spm_rate = fim_spm_rate - self.fim_extra_tokens = fim_extra_tokens - self.fim_split_sample = fim_split_sample - self.fim_fragment_rate = fim_fragment_rate - self.fim_no_prefix = fim_no_prefix - - super().__init__(**kwargs) - - -@dataclass -class MockGPTDatasetConfig(GPTDatasetConfig): - """Modifies GPTDatasetConfig to enforce necessary options for creating a mock dataset.""" - - def __init__( - self, - seq_length: int, - **kwargs, - ): - super().__init__(seq_length=seq_length, **kwargs) - - def finalize(self): - """ """ - # Raise TypeError if `blend` or `blend_per_split` is not None - if self.__dict__.get("blend", None): - raise TypeError("got an unexpected keyword argument 'blend'") - if self.__dict__.get("blend_per_split", None): - raise TypeError("got an unexpected keyword argument 'blend_per_split'") - if self.__dict__.get("blend", None) and self.__dict__.get("blend_per_split", None): - raise TypeError("got an unexpected keyword argument") - - # Drop `blend` and `blend_per_split` from __dict__ - self.__dict__.pop("blend", None) - self.__dict__.pop("blend_per_split", None) - - return super().finalize() - - -@dataclass(kw_only=True) -class FinetuningDatasetConfig(DataloaderConfig): - """Configuration specific to finetuning datasets, inheriting from DataloaderConfig. - - Note: For fine-tuning, dataloader_type defaults to 'batch' which ensures sequences - within each global batch are padded to the same length. - """ - - dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = "batch" - """Dataloader type for fine-tuning. 
Defaults to 'batch' for optimal padding behavior.""" - - dataset_root: Optional[Union[str, Path]] = None - seq_length: int - seed: int = 1234 - memmap_workers: int = 1 - max_train_samples: Optional[int] = None - packed_sequence_specs: Optional[PackedSequenceSpecs] = None - dataset_kwargs: Optional[dict[str, Any]] = None - do_validation: bool = True - do_test: bool = True - - -@dataclass(kw_only=True) -class SchedulerConfig(MTrainSchedulerConfig): - """Configuration settings for the learning rate scheduler and weight decay.""" - - def finalize(self) -> None: - """Post-initialization checks for scheduler config.""" - if self.start_weight_decay is not None: - assert self.start_weight_decay >= 0.0, "start_weight_decay should be positive." - assert self.end_weight_decay >= self.start_weight_decay - - if self.override_opt_param_scheduler: - assert not self.use_checkpoint_opt_param_scheduler, "both override and use-checkpoint are set." - - # Validate mutual exclusivity between iteration-based and sample-based scheduler fields - has_iter_fields = ( - self.lr_decay_iters is not None or self.lr_warmup_iters != 0 or self.lr_wsd_decay_iters is not None - ) - has_sample_fields = ( - self.lr_decay_samples is not None or self.lr_warmup_samples != 0 or self.lr_wsd_decay_samples is not None - ) - - assert not (has_iter_fields and has_sample_fields), ( - f"Cannot mix iteration-based and sample-based scheduler fields. " - f"Found iteration fields: lr_decay_iters={self.lr_decay_iters}, lr_warmup_iters={self.lr_warmup_iters}, lr_wsd_decay_iters={self.lr_wsd_decay_iters}. " - f"Found sample fields: lr_decay_samples={self.lr_decay_samples}, lr_warmup_samples={self.lr_warmup_samples}, lr_wsd_decay_samples={self.lr_wsd_decay_samples}. " - f"Use either iteration fields OR sample fields, not both." 
- ) - - # Validate mutual exclusivity between lr_warmup_fraction and specific warmup fields - if self.lr_warmup_fraction is not None: - assert self.lr_warmup_iters == 0 and self.lr_warmup_samples == 0, ( - f"Cannot specify lr_warmup_fraction={self.lr_warmup_fraction} with lr_warmup_iters={self.lr_warmup_iters} or lr_warmup_samples={self.lr_warmup_samples}. " - f"Use either lr_warmup_fraction OR lr_warmup_iters OR lr_warmup_samples." - ) - - -@dataclass(kw_only=True) -class TrainingConfig(MTrainTrainingConfig): - """Configuration settings related to the training loop and validation.""" - - check_optimizer_step_success: bool = True - """Checks optimizer.step() succeeded at each training step .""" - - skip_sync_grad_norm_across_mp: bool = False - """Skips syncing the grad norm across the model parallel group.""" - - # ---------------- Validation config. ---------------- - - eval_iters: int | None = None - """Number of iterations to run for evaluation validation/test for. Deprecated in favor of ValidationConfig.""" - - eval_interval: int | None = None - """Interval between running evaluation on validation set. Deprecated in favor of ValidationConfig.""" - - skip_train: bool | None = None - """If set, bypass the training loop, optionally do evaluation for validation/test, and exit. 
Deprecated in favor of ValidationConfig.""" - - def finalize(self) -> None: - """Validate training mode specification and calculate train_iters from train_samples if needed.""" - has_train_iters = self.train_iters is not None - has_train_samples = self.train_samples is not None - - assert has_train_iters or has_train_samples, "Either train_iters or train_samples must be provided" - assert not (has_train_iters and has_train_samples), "Cannot specify both train_iters and train_samples" - if has_train_samples: - assert self.train_samples > 0, "train_samples must be positive" - assert self.rampup_batch_size is None, "Batch size rampup not supported with sample-based training yet" - - # Calculate train_iters from train_samples (rampup_batch_size already validated as None) - self.train_iters = self.train_samples // self.global_batch_size - print_rank_0(f"Setting training iterations to {self.train_iters} based on {self.train_samples} samples") - - -@dataclass(kw_only=True) -class CheckpointConfig(MTrainCheckpointConfig): - """Configuration settings for model checkpointing (saving and loading).""" - - pretrained_checkpoint: Optional[str] = None - """Directory containing a pretrained model checkpoint for finetuning. - - This can be either: - - A parent checkpoint directory (e.g. ``/checkpoints/my_model/``) that - contains tracker files (``latest_train_state.pt``) and ``iter_*`` - subdirectories. - - A specific iteration directory (e.g. - ``/checkpoints/my_model/iter_0001000/``) that directly contains the - checkpoint payload (``run_config.yaml``, weight shards, etc.). - """ - - storage_writers_per_rank: int = 1 - """Number of storage writers per rank for torch_dist checkpoint format. - Affects the number of checkpoint files: saving_ranks * storage_writers_per_rank.""" - - use_persistent_ckpt_worker: bool = True - """Use a persistent background worker for async checkpoint saves. When enabled, creates a dedicated - worker thread/process for handling async saves. 
When disabled, uses temporal workers that are - created and destroyed for each save operation.""" - - async_strategy: str = "nvrx" - """Async checkpoint strategy to use. Options: ``"nvrx"`` (default) or ``"mcore"``. - The ``"nvrx"`` strategy uses nvidia_resiliency_ext for async checkpointing and falls back - to ``"mcore"`` if the package is not installed.""" - - async_write_results_mp_mode: str = "fork" - """Multiprocessing start method for the async write results queue. - Options: ``"fork"`` (default), ``"spawn"``, ``"forkserver"``.""" - - strict_fsdp_dtensor_load: bool = False - """Whether to enforce strict loading for FSDP DTensor checkpoints. When False, allows partial loading.""" - - custom_manager_class: str | None = None - """Fully qualified class name for a custom CheckpointManager implementation. - - When set, checkpoint operations will instantiate and delegate to this class instead of the default - checkpoint manager. The custom class must implement the `CheckpointManager` protocol - defined in `megatron.bridge.training.checkpointing`. - - Example: ``'mypackage.checkpoint.MyCheckpointManager'`` - """ - - def finalize(self) -> None: - """Post-initialization checks for checkpoint config.""" - if self.pretrained_checkpoint is not None: - from megatron.bridge.training.utils.checkpoint_utils import file_exists - - assert file_exists(self.pretrained_checkpoint), ( - f"Pretrained checkpoint {self.pretrained_checkpoint} does not exist" - ) - - if self.load_main_params_from_ckpt: - assert not self.load_optim, "load_main_params_from_ckpt must be used with load_optim=False" - - if self.async_save: - assert self.save is not None, "async_save is enabled, but save is not set. Set save to a valid path." - assert self.use_persistent_ckpt_worker, "async_save requires use_persistent_ckpt_worker=True." 
- - # Validate ckpt_step if specified - if self.ckpt_step is not None: - if self.load is None: - raise ValueError( - f"ckpt_step={self.ckpt_step} specified but checkpoint.load is None. " - f"Please set checkpoint.load to the base checkpoint directory." - ) - - if self.dist_ckpt_optim_fully_reshardable: - assert not self.distrib_optim_fully_reshardable_mem_efficient, ( - "distrib_optim_fully_reshardable_mem_efficient requires use_gloo_process_groups" - ) - - -@dataclass(kw_only=True) -class LoggerConfig(MTrainLoggerConfig): - """Configuration settings for logging, including TensorBoard and WandB.""" - - skip_train_metrics_log: bool = False - """Skips logging of training metrics to all logging backends and to the console as well.""" - - timing_log_level: Literal[-1, 0, 1, 2] = 0 - """Granularity level to measure and report timing. - -1: To disable timing logging as the timer start from 0 and above. - 0: report only iteration time and make sure timing does not introduce extra overhead. - 1: report timing for operations that are executed very limited times (basically once) during each iteration - (such as gradient all-reduce) - 2: report timing for operations that migh be executed numerous times during each iteration. - Note that setting the level to 1 or 2 might cause increase in iteration time. - """ - - mlflow_experiment: Optional[str] = None - """The MLFlow experiment name.""" - - mlflow_run_name: Optional[str] = None - """The MLFlow run name.""" - - mlflow_tracking_uri: Optional[str] = None - """Optional MLFlow tracking URI.""" - - mlflow_tags: Optional[dict[str, str]] = None - """Optional tags to apply to the MLFlow run.""" - - comet_project: Optional[str] = None - """The Comet ML project name. Comet logging is disabled when this is None.""" - - comet_experiment_name: Optional[str] = None - """The Comet ML experiment name.""" - - comet_workspace: Optional[str] = None - """The Comet ML workspace. 
If not set, uses the default workspace for the API key.""" - - comet_api_key: Optional[str] = None - """The Comet ML API key. Can also be set via COMET_API_KEY environment variable.""" - - comet_tags: Optional[list[str]] = None - """Optional list of tags to apply to the Comet ML experiment.""" - - logging_level: int = logging.INFO - """Set default logging level""" - - def finalize(self) -> None: - """Validate logger settings and optional MLFlow dependency.""" - if self.mlflow_experiment and (self.mlflow_run_name is None or self.mlflow_run_name == ""): - raise ValueError("Set logger.mlflow_run_name when enabling MLFlow logging.") - - using_mlflow = any( - [ - self.mlflow_experiment, - self.mlflow_run_name, - self.mlflow_tracking_uri, - self.mlflow_tags, - ] - ) - - if using_mlflow: - try: - import importlib - - importlib.import_module("mlflow") - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "MLFlow logging is configured, but the 'mlflow' package is not installed. " - "Install it via pip install mlflow or uv add mlflow" - ) from exc - - if self.comet_project and (self.comet_experiment_name is None or self.comet_experiment_name == ""): - raise ValueError("Set logger.comet_experiment_name when enabling Comet ML logging.") - - using_comet = any( - [ - self.comet_project, - self.comet_experiment_name, - self.comet_workspace, - self.comet_api_key, - self.comet_tags, - ] - ) - - if using_comet: - try: - import importlib - - importlib.import_module("comet_ml") - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "Comet ML logging is configured, but the 'comet_ml' package is not installed. 
" - "Install it via pip install comet-ml or uv add comet-ml" - ) from exc - - -@dataclass(kw_only=True) -class ProfilingConfig(MTrainProfilingConfig): - """Configuration settings for profiling the training process.""" - - def finalize(self) -> None: - """Validate profiling configuration.""" - assert not (self.use_pytorch_profiler and self.use_nsys_profiler), ( - "Exactly one of pytorch or nsys profiler should be enabled, not both, when ProfilingConfig is active." - ) - assert self.profile_step_start >= 0, f"profile_step_start must be >= 0, got {self.profile_step_start}" - assert self.profile_step_end >= 0, f"profile_step_end must be >= 0, got {self.profile_step_end}" - assert self.profile_step_end >= self.profile_step_start, ( - f"profile_step_end ({self.profile_step_end}) must be >= profile_step_start ({self.profile_step_start})" - ) - - -@dataclass(kw_only=True) -class TensorInspectConfig: - """Configuration for Nvidia-DL-Framework-Inspect integration.""" - - enabled: bool = False - """Enable tensor inspection and statistics collection.""" - - features: dict[str, Any] | str | Path | None = None - """Feature configuration as a Python dict or a YAML file path.""" - - feature_dirs: list[str] | None = None - """Directories containing feature implementations (searched recursively).""" - - log_dir: str | None = None - """Root directory to store inspection logs/statistics. Defaults to checkpoint save dir if unset.""" - - init_training_step: int = 0 - """Initial training step for the inspector (used when resuming).""" - - def finalize(self) -> None: - """Populate sensible defaults when inspection is enabled. - - - If feature_dirs is unset, default to the installed TransformerEngine - debug features package path (transformer_engine.debug.features), when available. 
- """ - if not self.enabled: - return - if not self.feature_dirs: - try: - import importlib - - te_features_mod = importlib.import_module("transformer_engine.debug.features") - te_features_dir = Path(te_features_mod.__file__).parent - if te_features_dir.exists(): - self.feature_dirs = [str(te_features_dir)] - except Exception: - pass - - -@dataclass -class FaultToleranceConfig: - """Configuration settings related to fault tolerance mechanisms (NVIDIA internal use).""" - - enable_ft_package: bool = False - """If set, Fault Tolerance package is enabled. Note: This feature is for Nvidia internal use only.""" - - calc_ft_timeouts: bool = False - """If set, FT package will try to automatically compute the timeouts. - Note: This feature is for Nvidia internal use only. - """ - - simulate_fault: bool = False - """Sets a simulated fault for fault tolerance. NOTE: This if for fault tolerance testing only.""" - - simulated_fault_type: Literal["rank_hung", "rank_killed", "random"] = "random" - """How the simulated fault should behave. 'random' will randomly choose one of the other two options.""" - - simulated_fault_rank: Optional[int] = None - """Rank on which simulated fault should occur.""" - - simulated_fault_base_delay: int = 0 - """Base delay before simulated fault thread is started. 
A small random delay is added to this.""" - - -@dataclass(kw_only=True) -class StragglerDetectionConfig(MTrainStragglerDetectionConfig): - """Configuration settings for detecting and logging GPU stragglers.""" - - enable_straggler_on_startup: bool = True - """If set, StragglerDetector is enabled on startup.""" - - -@dataclass -class NVRxStragglerDetectionConfig: - """Configuration settings for NVIDIA Resiliency Extension straggler detection.""" - - enabled: bool = False - """Enable NVRx straggler detection.""" - - report_time_interval: float = 300.0 - """Interval [seconds] of the straggler check.""" - - calc_relative_gpu_perf: bool = True - """Calculate relative GPU performance scores.""" - - calc_individual_gpu_perf: bool = True - """Calculate individual GPU performance scores.""" - - num_gpu_perf_scores_to_print: int = 5 - """How many best and worst perf scores to print (0 - does not print periodically, - but only if stragglers are detected).""" - - gpu_relative_perf_threshold: float = 0.7 - """Threshold for relative GPU performance scores.""" - - gpu_individual_perf_threshold: float = 0.7 - """Threshold for individual GPU performance scores.""" - - stop_if_detected: bool = False - """Set to True, to terminate the workload if stragglers are detected.""" - - enable_logging: bool = True - """Set to True, to log GPU performance scores.""" - - profiling_interval: int = 1 - """Profiling interval passed to straggler.Detector.initialize.""" - - logger_name: str = "megatron.bridge.NVRxStragglerDetection" - """Logger name for straggler detection messages.""" - - def finalize(self) -> None: - """Validate NVRx straggler detection configuration.""" - if self.enabled: - if not (self.calc_relative_gpu_perf or self.calc_individual_gpu_perf): - raise ValueError( - "At least one of calc_relative_gpu_perf or calc_individual_gpu_perf must be True " - "when NVRx straggler detection is enabled." 
- ) - if self.report_time_interval <= 0: - raise ValueError("report_time_interval must be positive.") - if not (0.0 <= self.gpu_relative_perf_threshold <= 1.0): - raise ValueError("gpu_relative_perf_threshold must be between 0.0 and 1.0.") - if not (0.0 <= self.gpu_individual_perf_threshold <= 1.0): - raise ValueError("gpu_individual_perf_threshold must be between 0.0 and 1.0.") - - -@dataclass -class InProcessRestartConfig: - """Configuration settings for NVIDIA Resiliency Extension in-process restart functionality.""" - - enabled: bool = False - """Enable in-process restart mechanism from nvidia-resiliency-ext.""" - - max_iterations: Optional[int] = None - """Maximum number of in-process restart iterations.""" - - monitor_thread_interval: float = 1.0 - """Monitoring interval (in seconds) for the monitoring thread.""" - - monitor_process_interval: float = 1.0 - """Monitoring interval (in seconds) for the monitoring process.""" - - progress_watchdog_interval: float = 1.0 - """Interval (in seconds) for automatic progress watchdog timestamp updates.""" - - heartbeat_interval: float = 30.0 - """Monitoring interval (in seconds) for detecting unresponsive ranks.""" - - soft_timeout: float = 60.0 - """Soft progress timeout (in seconds).""" - - hard_timeout: float = 90.0 - """Hard progress timeout (in seconds).""" - - heartbeat_timeout: float = 60.0 - """Timeout (in seconds) for a missing rank detection heartbeat.""" - - barrier_timeout: float = 120.0 - """Timeout (in seconds) for internal distributed barrier.""" - - completion_timeout: float = 120.0 - """Timeout (in seconds) for barrier on completion on all ranks.""" - - last_call_wait: float = 1.0 - """Time interval (in seconds) for other ranks to report concurrent terminal failures.""" - - termination_grace_time: float = 1.0 - """Interval (in seconds) between SIGTERM and SIGKILL issued on hard timeout.""" - - granularity: Literal["node", "rank"] = "node" - """Granularity for in-process restart.""" - - 
active_world_size: Optional[int] = None - """The number of ranks initially executing the workload. - The remaining ranks from the allocation are set aside as warm reserve. - If None, defaults to WORLD_SIZE environment variable.""" - - empty_cuda_cache: bool = True - """Empty CUDA cache during restart finalization.""" - - max_rank_faults: Optional[int] = None - """Maximum number of rank faults allowed before terminating the job.""" - - monitor_process_logdir: Optional[str] = None - """Directory for monitor process log files. If None, monitor process logging is disabled.""" - - -# ---------------- Container config (standalone top-level config) ---------------- -@dataclass(kw_only=True) -class ConfigContainer(Container): - """Top-level container holding all configuration objects.""" - - rng: RNGConfig = field(default_factory=RNGConfig) - rerun_state_machine: RerunStateMachineConfig = field(default_factory=RerunStateMachineConfig) - train: TrainingConfig - model: ( - GPTModelProvider | T5ModelProvider | MambaModelProvider | MimoModelProvider | GPTModelConfig | MambaModelConfig - ) - optimizer: OptimizerConfig - optimizer_config_override_provider: OptimizerConfigOverrideProvider = field( - default_factory=OptimizerConfigOverrideProvider - ) - ddp: DistributedDataParallelConfig = field(default_factory=DistributedDataParallelConfig) - validation: ValidationConfig = field(default_factory=ValidationConfig) - scheduler: SchedulerConfig - dataset: GPTDatasetConfig | FinetuningDatasetConfig | DatasetProvider - logger: LoggerConfig - tokenizer: TokenizerConfig - checkpoint: CheckpointConfig - dist: DistributedInitConfig = field(default_factory=DistributedInitConfig) - ft: Optional[FaultToleranceConfig] = None - straggler: Optional[StragglerDetectionConfig] = None - nvrx_straggler: Optional[NVRxStragglerDetectionConfig] = None - profiling: ProfilingConfig = field(default_factory=ProfilingConfig) - peft: Optional[PEFT] = None - comm_overlap: Optional[CommOverlapConfig] = None - 
mixed_precision: Optional[Union[MixedPrecisionConfig, str]] = None - tensor_inspect: TensorInspectConfig | None = None - inprocess_restart: Optional[InProcessRestartConfig] = None - - def get_data_parallel_size(self, world_size: int) -> int: - """Calculate the data parallel size based on the model configuration.""" - model_cfg = self.model - total_model_size = ( - model_cfg.tensor_model_parallel_size - * model_cfg.pipeline_model_parallel_size - * model_cfg.context_parallel_size - ) - assert world_size % total_model_size == 0, f""" - world size ({world_size}) is not divisible by total_model_size ({model_cfg.tensor_model_parallel_size=} * {model_cfg.pipeline_model_parallel_size=} * {model_cfg.context_parallel_size=}) - """ - return world_size // total_model_size - - def set_data_parallel_size(self) -> None: - """Calculate and set data_parallel_size for this config and comm_overlap config. - - This method calculates the data parallel size needed by setup methods, without - triggering full validation or finalization of Megatron Core configs. - """ - # Calculate data parallel size (needed for comm overlap setup) - world_size = get_world_size_safe() - self.data_parallel_size = self.get_data_parallel_size(world_size) - - # Set data_parallel_size on comm_overlap config if present - if self.comm_overlap is not None: - self.comm_overlap.data_parallel_size = self.data_parallel_size - - def _validate_and_apply_deterministic_mode(self) -> None: - """Apply and validate deterministic mode requirements. - - This enforces restrictions and settings that must hold when - the model is configured to run in deterministic mode. 
- """ - if not getattr(self.model, "deterministic_mode", False): - return - - # Disallow flash attention when running deterministically - if getattr(self.model, "attention_backend", None) == AttnBackend.flash: - raise AssertionError("Flash attention can not be used in deterministic mode.") - - # Disallow cross-entropy loss fusion as it is not deterministic - assert not getattr(self.model, "cross_entropy_loss_fusion", False), ( - "Cross Entropy Fusion is currently not deterministic." - ) - - all_reduce_choices = ("Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS") - assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, ( - f"NCCL_ALGO must be one of {all_reduce_choices}." - ) - - # Enable deterministic algorithms in torch - torch.use_deterministic_algorithms(True) - - def validate(self) -> None: - """Performs validation checks on the combined configuration. - - Calculates dependent values like data_parallel_size and scheduler steps. - Ensures compatibility between different configuration settings. - """ - - # Propagate in-batch packing flag to model config so TransformerConfig.finalize() - # can enable variable_seq_lengths for pipeline parallelism. - if getattr(self.dataset, "pack_sequences_in_batch", False): - self.model._pack_sequences_in_batch = True - - if hasattr(self.dataset, "finalize"): - self.dataset.finalize() - if hasattr(self.ddp, "finalize"): - self.ddp.finalize() - if hasattr(self.optimizer, "finalize"): - self.optimizer.finalize() - if hasattr(self.model, "finalize"): - self.model.finalize() - - self.logger.finalize() - self.train.finalize() - self.scheduler.finalize() - self.checkpoint.finalize() - if self.profiling is not None: - self.profiling.finalize() - if self.nvrx_straggler is not None: - self.nvrx_straggler.finalize() - if self.tensor_inspect is not None: - self.tensor_inspect.finalize() - - # Sync config. If TE RNG tracker is set in either ways, set them in both places. 
- if self.rng.te_rng_tracker or self.model.use_te_rng_tracker: - self.model.use_te_rng_tracker = self.rng.te_rng_tracker = True - - # Re-run post-inits of sub-configs - for f in fields(self): - sub_cfg = getattr(self, f.name) - if hasattr(sub_cfg, "__post_init__") and not hasattr(sub_cfg, "finalize"): - sub_cfg.__post_init__() - - # Distributed - ensure data_parallel_size is calculated (might already be set by set_data_parallel_size) - if not hasattr(self, "data_parallel_size") or self.data_parallel_size is None: - world_size = get_world_size_safe() - self.data_parallel_size = self.get_data_parallel_size(world_size) - # Set data_parallel_size on comm_overlap config if present - if self.comm_overlap is not None: - self.comm_overlap.data_parallel_size = self.data_parallel_size - - # Deterministic mode validations and settings - self._validate_and_apply_deterministic_mode() - - # Run validations - _validate_and_sync_distributed_optimizer_settings(self) - _validate_mixed_precision_consistency(self) - _validate_fine_grained_activation_offloading(self) - - # CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph - if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope: - assert not self.rerun_state_machine.check_for_nan_in_loss, ( - "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. " - "Set rerun_state_machine.check_for_nan_in_loss=False." 
- ) - if self.model.cuda_graph_impl == "none": - self.model.cuda_graph_scope = [] - - if self.dist.use_megatron_fsdp and self.dist.use_torch_fsdp2: - raise ValueError("Using use_megatron_fsdp and use_torch_fsdp2 at the same time is not supported.") - - # Megatron FSDP Config checks - if self.dist.use_megatron_fsdp or self.ddp.use_megatron_fsdp: - # Set Megatron FSDP Configs - self.dist.use_megatron_fsdp = True - self.ddp.use_megatron_fsdp = True - - assert not self.dist.use_tp_pp_dp_mapping, "use_tp_pp_dp_mapping is not supported with Megatron FSDP" - - if self.checkpoint.save is not None or self.checkpoint.load is not None: - # only check if saving or loading - assert self.checkpoint.ckpt_format == "fsdp_dtensor", ( - "Megatron FSDP only supports fsdp_dtensor checkpoint format" - ) - - if self.ddp.average_in_collective and not self.ddp.disable_symmetric_registration: - print_rank_0( - "average_in_collective is not supported with NCCL symmetric registration, setting to False" - ) - self.ddp.average_in_collective = False - - # reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP - if self.ddp.reuse_grad_buf_for_mxfp8_param_ag: - print_rank_0("reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP, setting to False") - self.ddp.reuse_grad_buf_for_mxfp8_param_ag = False - if self.optimizer.reuse_grad_buf_for_mxfp8_param_ag: - self.optimizer.reuse_grad_buf_for_mxfp8_param_ag = False - - # ModelOpt/Quantization checks - if getattr(self.model, "restore_modelopt_state", False): - assert not self.model.gradient_accumulation_fusion, ( - "Gradient accumulation fusion is not supported with ModelOpt/Quantized models. 
" - "Please set model.gradient_accumulation_fusion=False" - ) - - # Checkpoint - if self.checkpoint.save is not None or self.checkpoint.load is not None: - # only check if saving or loading - if self.checkpoint.ckpt_format == "fsdp_dtensor": - assert self.ddp.use_megatron_fsdp and not self.dist.use_torch_fsdp2, ( - "fsdp_dtensor checkpoint format only supports Megatron FSDP" - ) - - # Enforce async_save format restriction - if self.checkpoint.async_save: - assert self.checkpoint.ckpt_format == "torch_dist", ( - "async_save is only supported with ckpt_format='torch_dist'" - ) - - # Set defaults for tensor inspect callback - if self.tensor_inspect is not None and self.tensor_inspect.enabled: - if self.tensor_inspect.log_dir is None: - self.tensor_inspect.log_dir = self.checkpoint.save or "." - if self.tensor_inspect.init_training_step == 0 and self.checkpoint.ckpt_step is not None: - self.tensor_inspect.init_training_step = int(self.checkpoint.ckpt_step) - - self.model.use_cpu_initialization = self.model.use_cpu_initialization or self.dist.lazy_mpu_init - - # Gloo process groups are not supported when using decentralized process groups (NCCL only). - if self.dist.use_decentralized_pg: - assert not self.dist.use_gloo_process_groups, ( - "Gloo process groups are not supported when use_decentralized_pg=True. " - "Decentralized process groups only support NCCL backend." - ) - - # Make sure all functionality that requires Gloo process groups is disabled. - if not self.dist.use_gloo_process_groups: - if self.optimizer.use_distributed_optimizer: - # If using distributed optimizer, must use distributed checkpointing. - # Legacy checkpointing uses Gloo process groups to collect full distributed - # optimizer state in the CPU memory of DP rank 0. 
- assert self.checkpoint.ckpt_format == "torch_dist" - - # Cross-validation between training and scheduler configs - self._validate_training_scheduler_compatibility() - - # Calculate scheduler steps for both iteration-based and sample-based training - self._calculate_scheduler_steps() - - if self.model.context_parallel_size > 1: - assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, ( - "Sequence length must be divisible by 2 * context parallel size if context parallel is used." - ) - if isinstance(self.dataset, FinetuningDatasetConfig): - # check calculate_per_token_loss to be True - # check average_in_collective to be False - # for context parallel to solve the issue of nan loss on ranks with all tokens masked - # (only happens in SFT) - assert self.model.calculate_per_token_loss, ( - "When finetuning with CP>1, calculate_per_token_loss must be True" - ) - assert not self.ddp.average_in_collective, ( - "When finetuning with CP>1, average_in_collective must be False" - ) - - self._validate_cp_comm_type() - - if ( - isinstance(self.dataset, FinetuningDatasetConfig) - and self.dataset.packed_sequence_specs is not None - and self.dataset.packed_sequence_specs.packed_sequence_size > 0 - and self.train.micro_batch_size > 1 - ): - packed_sequence_size = self.dataset.packed_sequence_specs.packed_sequence_size - raise ValueError( - "Micro batch size should be 1 when training with packed sequence, but your micro batch size " - f"is {self.train.micro_batch_size}. \nThe following config is equivalent to your current setting for " - f"a packed dataset. 
Please update your config to the following: \n" - f"Set micro batch size to 1 (currently {self.train.micro_batch_size})\n" - f"Set global batch size to {self.train.global_batch_size // self.train.micro_batch_size} " - f"(currently {self.train.global_batch_size}) \n" - f"Set packed sequence length to {packed_sequence_size * self.train.micro_batch_size} " - f"(currently {packed_sequence_size}) \n" - f"For details please visit " - f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html" - ) - - if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1: - raise ValueError( - "micro_batch_size should be greater than 1 when using pack_sequences_in_batch=True. " - "In-batch packing concatenates multiple sequences within a microbatch, so at least 2 sequences " - "are required per micro-batch." - ) - - if self.peft is not None: - assert self.checkpoint.pretrained_checkpoint is not None, "PEFT requires a pretrained checkpoint path" - - if self.dataset is not None: - # Only validate sequence length for GPTDatasetConfig or FinetuningDatasetConfig - # DatasetProvider instances may not have sequence_length attributes - if isinstance(self.dataset, (GPTDatasetConfig, FinetuningDatasetConfig)): - data_seq_length = ( - self.dataset.seq_length - if isinstance(self.dataset, FinetuningDatasetConfig) - else self.dataset.seq_length - ) - - assert self.model.seq_length == data_seq_length, ( - f"Please ensure sequence length configuration in model config and " - f"dataset config match.\nSequence length in model config: {self.model.seq_length}, " - f"Sequence length in dataset config: {data_seq_length}" - ) - - # Validate DeepEP or HybridEP is supported for the current GPU architecture - if isinstance(self.model, (GPTModelConfig, MambaModelConfig)): - validate_flex_dispatcher_backend(self.model.transformer) - else: - validate_flex_dispatcher_backend(self.model) - - for f in fields(ValidationConfig): - train_val = 
getattr(self.train, f.name, None) - if train_val is not None: - warnings.warn( - f"TrainingConfig.{f.name} is deprecated and will be removed in a future release. Use ValidationConfig.{f.name} instead.", - stacklevel=2, - ) - setattr(self.validation, f.name, train_val) - - def _validate_cp_comm_type(self) -> None: - """Validate cp_comm_type and hierarchical_context_parallel_sizes consistency.""" - cp_comm_type = getattr(self.model, "cp_comm_type", None) - hcp_sizes = getattr(self.model, "hierarchical_context_parallel_sizes", None) - cp_size = getattr(self.model, "context_parallel_size", 1) - - if cp_size > 1 and cp_comm_type is not None: - if isinstance(cp_comm_type, list): - assert len(cp_comm_type) == self.model.num_layers, ( - f"Length of cp_comm_type ({len(cp_comm_type)}) must equal num_layers ({self.model.num_layers})." - ) - else: - assert isinstance(cp_comm_type, str), ( - f"cp_comm_type must be a str or list of str, got {type(cp_comm_type)}." - ) - - cp_comm_types = cp_comm_type if isinstance(cp_comm_type, list) else [cp_comm_type or "p2p"] - if any("a2a+p2p" in ct for ct in cp_comm_types): - assert hcp_sizes is not None, ( - "hierarchical_context_parallel_sizes must be set when cp_comm_type " - "contains 'a2a+p2p'. Without it, CP communication is silently disabled " - "and each rank attends only to its local chunk, producing artificially " - "high throughput but broken training. Example: for cp=16 across 4 nodes " - "of 8 GPUs, set hierarchical_context_parallel_sizes=[8, 2]." - ) - - if hcp_sizes is not None: - from math import prod - - assert prod(hcp_sizes) == cp_size, ( - f"Product of hierarchical_context_parallel_sizes {hcp_sizes} " - f"(={prod(hcp_sizes)}) must equal context_parallel_size (={cp_size})." 
- ) - - def _validate_training_scheduler_compatibility(self) -> None: - """Cross-validation between training and scheduler configs.""" - has_train_samples = self.train.train_samples is not None - - if has_train_samples: - # Sample-based training validation - assert self.scheduler.lr_decay_iters is None, ( - "Use lr_decay_samples for sample-based training, not lr_decay_iters" - ) - assert self.scheduler.lr_warmup_iters == 0, ( - "Use lr_warmup_samples for sample-based training, not lr_warmup_iters" - ) - assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_samples != 0), ( - "Can only specify one of lr_warmup_fraction or lr_warmup_samples" - ) - else: - # Iteration-based training validation - assert self.scheduler.lr_decay_samples is None, ( - "Use lr_decay_iters for iteration-based training, not lr_decay_samples" - ) - assert self.scheduler.lr_warmup_samples == 0, ( - "Use lr_warmup_iters for iteration-based training, not lr_warmup_samples" - ) - assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_iters != 0), ( - "Can only specify one of lr_warmup_fraction or lr_warmup_iters" - ) - - def _calculate_scheduler_steps(self) -> None: - """Calculate scheduler steps for both iteration-based and sample-based training.""" - is_sample_based = self.train.train_samples is not None - - if is_sample_based: - if self.scheduler.lr_decay_samples is None: - self.scheduler.lr_decay_samples = self.train.train_samples - self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples - self.scheduler.wd_incr_steps = self.train.train_samples - - if self.scheduler.lr_wsd_decay_samples is not None: - self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_samples - - # Warmup calculation for sample-based training - if self.scheduler.lr_warmup_fraction is not None: - self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps - else: - self.scheduler.lr_warmup_steps = 
self.scheduler.lr_warmup_samples - else: - # Iteration-based training - if self.scheduler.lr_decay_iters is None: - self.scheduler.lr_decay_iters = self.train.train_iters - if self.scheduler.lr_wsd_decay_iters is None and self.scheduler.lr_decay_style == "WSD": - self.scheduler.lr_wsd_decay_iters = self.scheduler.lr_decay_iters - self.scheduler.lr_decay_steps = self.scheduler.lr_decay_iters * self.train.global_batch_size - self.scheduler.wd_incr_steps = self.train.train_iters * self.train.global_batch_size - - if self.scheduler.lr_wsd_decay_iters is not None: - self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_iters * self.train.global_batch_size - - if self.scheduler.lr_warmup_fraction is not None: - self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps - else: - self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_iters * self.train.global_batch_size - - # Enforce the Megatron Core invariant: lr_warmup_steps must be < lr_decay_steps. - # This can be violated when train_iters is small (e.g. smoke runs) while - # lr_warmup_iters is tuned for a full-length training run. - if self.scheduler.lr_decay_steps <= 0: - raise ValueError( - f"lr_decay_steps must be > 0, got {self.scheduler.lr_decay_steps}. " - "Please increase train_iters/train_samples or lr_decay_iters/lr_decay_samples." - ) - if self.scheduler.lr_warmup_steps >= self.scheduler.lr_decay_steps: - capped = self.scheduler.lr_decay_steps - 1 - warnings.warn( - f"lr_warmup_steps ({self.scheduler.lr_warmup_steps}) >= lr_decay_steps " - f"({self.scheduler.lr_decay_steps}); capping lr_warmup_steps to {capped}. " - "Reduce lr_warmup_iters (or lr_warmup_samples) for short training runs.", - UserWarning, - stacklevel=2, - ) - self.scheduler.lr_warmup_steps = capped - - def log_non_default_values(self) -> None: - """Log configuration values that differ from Megatron Core defaults. 
- - For configs that inherit from Megatron Core (e.g., OptimizerConfig, DDPConfig, - TransformerConfig), this method logs only the values that differ from the Mcore - defaults. This makes it easier to spot unintended deviations from baseline settings. - - For configs that don't inherit from Mcore, key values are logged via - `_get_key_config_values`, which excludes None values and callables. - """ - if isinstance(self.model, (GPTModelConfig, MambaModelConfig)): - transformer_cfg = self.model.transformer - else: - transformer_cfg = self.model - # Determine the correct Mcore parent class for the model config - # Some models (e.g., DeepSeek) use MLATransformerConfig instead of TransformerConfig - model_mcore_class = _get_mcore_transformer_parent(transformer_cfg) - - # Map of config names to their (config object, Mcore parent class or None) - mcore_configs = [ - ("optimizer", self.optimizer, MCoreOptimizerConfig), - ("ddp", self.ddp, MCoreDistributedDataParallelConfig), - ("model", transformer_cfg, model_mcore_class), - ] - - # Non-Mcore configs - log all values - non_mcore_configs = [ - ("train", self.train), - ("validation", self.validation), - ("scheduler", self.scheduler), - ("dataset", self.dataset), - ("checkpoint", self.checkpoint), - ("logger", self.logger), - ("tokenizer", self.tokenizer), - ("rng", self.rng), - ] - - log_lines = [""] - log_lines.append("=" * 70) - log_lines.append("Configuration Summary (Non-Default Values vs Megatron Core)") - log_lines.append("=" * 70) - - # Log non-default values for Mcore configs - for config_name, config_obj, mcore_class in mcore_configs: - non_defaults = _get_non_default_values(config_obj, mcore_class) - if non_defaults: - log_lines.append(f"\n[{config_name}] Non-default values (vs Mcore {mcore_class.__name__}):") - for field_name, (current_val, default_val) in sorted(non_defaults.items()): - log_lines.append(f" {field_name}: {current_val!r} (Mcore default: {default_val!r})") - - # Log key values for non-Mcore configs - 
log_lines.append("\n" + "-" * 70) - log_lines.append("Other Configuration Values:") - log_lines.append("-" * 70) - - for config_name, config_obj in non_mcore_configs: - if config_obj is None: - continue - key_values = _get_key_config_values(config_obj) - if key_values: - log_lines.append(f"\n[{config_name}]:") - for field_name, value in sorted(key_values.items()): - log_lines.append(f" {field_name}: {value!r}") - - log_lines.append("\n" + "=" * 70) - - print_rank_0("\n".join(log_lines)) - - -def _get_mcore_transformer_parent(model_config: Any) -> type: - """Determine the correct Mcore TransformerConfig parent class for a model. - - Some models (e.g., DeepSeek v2/v3) inherit from MLATransformerConfig instead of - the base TransformerConfig. This function checks the inheritance chain to find - the appropriate Mcore class to use as the baseline for comparison. - - Args: - model_config: The model configuration object. - - Returns: - The appropriate Mcore TransformerConfig class (MCoreMLATransformerConfig or - MCoreTransformerConfig). - """ - # Check if the model inherits from MLATransformerConfig - if isinstance(model_config, MCoreMLATransformerConfig): - return MCoreMLATransformerConfig - return MCoreTransformerConfig - - -def _get_non_default_values(config_obj: Any, mcore_class: type) -> Dict[str, Tuple[Any, Any]]: - """Get values that differ from Mcore parent class defaults. - - Args: - config_obj: The config object to compare. - mcore_class: The Megatron Core parent class to compare against. - - Returns: - Dictionary mapping field name to (current_value, default_value) for non-default fields. 
- """ - non_defaults = {} - - # Get default values from Mcore class - mcore_defaults = {} - for f in fields(mcore_class): - if f.name.startswith("_"): - continue - if f.default is not MISSING: - mcore_defaults[f.name] = f.default - elif f.default_factory is not MISSING: - mcore_defaults[f.name] = f.default_factory() - - # Compare current values against Mcore defaults - for f in fields(config_obj): - if f.name.startswith("_"): - continue - field_name = f.name - current_value = getattr(config_obj, field_name, None) - - if field_name in mcore_defaults: - default_value = mcore_defaults[field_name] - # Skip callable values (like functions) and complex objects - if callable(current_value) or callable(default_value): - continue - # Compare values - try: - if current_value != default_value: - non_defaults[field_name] = (current_value, default_value) - except (TypeError, ValueError): - # Some types may not be directly comparable (e.g., torch.dtype) - if str(current_value) != str(default_value): - non_defaults[field_name] = (current_value, default_value) - - return non_defaults - - -def _get_key_config_values(config_obj: Any) -> Dict[str, Any]: - """Get key configuration values for non-Mcore configs. - - Args: - config_obj: The config object to extract values from. - - Returns: - Dictionary mapping field name to value for key fields. - """ - values = {} - if not hasattr(config_obj, "__dataclass_fields__"): - return values - - for f in fields(config_obj): - if f.name.startswith("_"): - continue - value = getattr(config_obj, f.name, None) - # Skip None values and complex objects - if value is None: - continue - if callable(value): - continue - values[f.name] = value - - return values - - -def runtime_config_update(cfg: ConfigContainer) -> None: - """Apply runtime configuration updates prior to initialization. - - This function handles all configuration modifications that need to happen - after initial config creation but before final validation and model setup. - - Steps: - 1. 
Resolve mixed precision configuration from string if needed - 2. Apply mixed precision settings to model, optimizer, and DDP configs - 3. Calculate data parallel size (needed for comm overlap) - 4. Apply communication overlap configuration - 5. Validate configuration after all modifications - - Args: - cfg: Configuration container to update - """ - # Apply mixed precision configuration if provided - if cfg.mixed_precision is not None: - if isinstance(cfg.mixed_precision, str): - cfg.mixed_precision = get_mixed_precision_config(cfg.mixed_precision) - cfg.mixed_precision.finalize() - cfg.mixed_precision.setup(cfg.model, cfg.optimizer, cfg.ddp) - - # Calculate data parallel size (needed for comm overlap methods) - cfg.set_data_parallel_size() - - # Apply communication overlap configuration if provided - if cfg.comm_overlap is not None: - cfg.comm_overlap.finalize() - cfg.comm_overlap.setup(cfg.model, cfg.optimizer, cfg.ddp) - - # Validate configuration after all modifications - cfg.validate() - - -def mimo_runtime_config_update(cfg: ConfigContainer) -> None: - """MIMO-equivalent of ``runtime_config_update``. - - The standard ``runtime_config_update`` cannot be used directly because it - accesses ``cfg.model`` attributes (``bf16``, ``tensor_model_parallel_size``, - ``cuda_graph_impl``, …) that do not exist on ``MimoModelProvider``. - - This function cherry-picks the safe, model-agnostic parts: - - Keeps (safe for MIMO): - - ``data_parallel_size = 1`` (MIMO-specific hard-code) - - Sub-config finalization (optimizer, ddp, logger, train, scheduler, checkpoint) - - Distributed optimizer sync validation - - Deterministic mode validation - - Skips (would crash or is N/A): - - Mixed precision resolution (per-module, not container-level) - - Communication overlap setup (not supported for MIMO) - - Model-level validations (FSDP, CUDA graphs, TE RNG tracker sync, etc.) - - See ``playground/runtime_config_update_analysis.md`` for the full analysis. 
- """ - # MIMO: data_parallel_size is always 1 from the training loop's perspective. - cfg.data_parallel_size = 1 - - # Finalize sub-configs that don't depend on model construction order. - # NOTE: cfg.model.finalize() is NOT called here — it validates parallelism - # config and is called inside setup_mimo() right before build_infra(). - if hasattr(cfg.optimizer, "finalize"): - cfg.optimizer.finalize() - if hasattr(cfg.ddp, "finalize"): - cfg.ddp.finalize() - cfg.logger.finalize() - cfg.train.finalize() - cfg.scheduler.finalize() - cfg.checkpoint.finalize() - - # Safe validations - _validate_and_sync_distributed_optimizer_settings(cfg) - cfg._validate_and_apply_deterministic_mode() - - -def _validate_and_sync_distributed_optimizer_settings(config: ConfigContainer) -> None: - """Validate and synchronize distributed optimizer settings between DDP and optimizer configs. - - This function ensures that distributed optimizer settings are consistent across - DDP and optimizer configurations. If either setting is enabled, both will be - enabled to maintain consistency. - - Args: - config: The configuration container to validate and potentially modify. - """ - ddp_setting = config.ddp.use_distributed_optimizer - optimizer_setting = config.optimizer.use_distributed_optimizer - - if ddp_setting or optimizer_setting: - if ddp_setting != optimizer_setting: - warn_rank_0( - f"Distributed optimizer settings were not in sync: " - f"ddp.use_distributed_optimizer={ddp_setting}, " - f"optimizer.use_distributed_optimizer={optimizer_setting}. " - f"Automatically enabling distributed optimizer for both settings." - ) - config.ddp.use_distributed_optimizer = True - config.optimizer.use_distributed_optimizer = True - - -def _validate_mixed_precision_consistency(config: ConfigContainer) -> None: - """Validate that mixed precision settings are consistent between model and optimizer configs. - - Args: - config: The configuration container to validate. 
- - Raises: - AssertionError: If precision settings are inconsistent in a way that would - indicate ambiguous behavior. - """ - model_cfg = config.model - optimizer_cfg = config.optimizer - - # Mutually exclusive: cannot have both bf16 and fp16 enabled - assert not (model_cfg.bf16 and model_cfg.fp16), ( - "Model config cannot have both bf16=True and fp16=True. Please set only one precision mode." - ) - assert not (optimizer_cfg.bf16 and optimizer_cfg.fp16), ( - "Optimizer config cannot have both bf16=True and fp16=True. Please set only one precision mode." - ) - - # Validate across model and optimizer configs - if optimizer_cfg.use_precision_aware_optimizer: - # For bf16 training: optimizer.bf16 must match model.bf16 - if model_cfg.bf16: - assert optimizer_cfg.bf16, ( - "optimizer.bf16=True must be set when model.bf16=True and use_precision_aware_optimizer=True." - ) - # For fp16 training: optimizer.fp16 must match model.fp16 - if model_cfg.fp16: - assert optimizer_cfg.fp16, ( - "optimizer.fp16=True must be set when model.fp16=True and use_precision_aware_optimizer=True." - ) - # For fp32 training (neither bf16 nor fp16 on model) - if not model_cfg.bf16 and not model_cfg.fp16: - assert not optimizer_cfg.bf16 and not optimizer_cfg.fp16, ( - "optimizer.bf16 and optimizer.fp16 must both be False when " - "model is using fp32 precision (model.bf16=False, model.fp16=False) and " - "use_precision_aware_optimizer=True." - ) - - -def _validate_fine_grained_activation_offloading(config: ConfigContainer) -> None: - """Validate fine-grained activation offloading configuration. - - This function ensures that fine-grained activation offloading is only enabled - with compatible configurations (transformer_engine implementation) and that - necessary environment variables are set for newer TE versions. - - Args: - config: The configuration container to validate. - - Raises: - ValueError: If fine-grained activation offloading is enabled with incompatible settings. 
- """ - from megatron.core.utils import is_te_min_version - - model_cfg = config.model - - if not model_cfg.fine_grained_activation_offloading: - return - - # Fine-grained activation offloading requires transformer_engine implementation - if model_cfg.transformer_impl != "transformer_engine": - raise ValueError( - "Fine-grained activation offloading is only supported with transformer_engine implementation. " - f"Current transformer_impl: {model_cfg.transformer_impl}" - ) - - # For TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 must be set to avoid offloading weights - if is_te_min_version("2.10.0"): - if os.getenv("NVTE_CPU_OFFLOAD_V1", "0") != "1": - raise ValueError( - "For fine-grained activation offloading with TE >= 2.10.0, " - "NVTE_CPU_OFFLOAD_V1 environment variable should be set to 1 to avoid offloading weights." - ) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/comm_overlap.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
@dataclass
class TPOverlapCfg:
    """Common base class for the per-layer TP overlap settings below.

    It declares no fields; it exists so that the three overlap methods
    (pipeline, ring exchange, bulk) share one type.
    """


@dataclass
class PipelineOverlapCfg(TPOverlapCfg):
    """Settings for a layer that uses the ``"pipeline"`` overlap method."""

    num_sm: int
    cga_size: int
    num_splits: int
    set_sm_margin: bool
    # Must be a real bool. A one-element tuple such as ``(False,)`` is truthy,
    # so any truthiness check on this field would read it as "enabled".
    fp8_buf: bool = False
    atomic_gemm: bool = False
    method: str = "pipeline"


@dataclass
class RingExchangeOverlapCfg(TPOverlapCfg):
    """Settings for a layer that uses the ``"ring_exchange"`` overlap method.

    Every field has a default, so ``RingExchangeOverlapCfg()`` is valid.
    """

    aggregate: bool = False
    method: str = "ring_exchange"
    num_sm: int = 1
    cga_size: int = 1
    set_sm_margin: bool = False
    fp8_buf: bool = False
    atomic_gemm: bool = False


@dataclass
class BulkOverlapCfg(TPOverlapCfg):
    """Settings for a layer that uses the ``"bulk"`` overlap method."""

    num_sm: int
    cga_size: int
    set_sm_margin: bool
    method: str = "bulk"


@dataclass
class TransformerLayerTPOverlapCfg:
    """One overlap setting per GEMM of a transformer layer.

    The field names pair a linear layer (``qkv``, ``proj``, ``fc1``, ``fc2``)
    with a pass (``fprop``, ``dgrad``, ``wgrad``). The two ``wgrad`` entries
    may be ``None``; the LoRA presets in this module leave them unset.
    """

    qkv_dgrad: TPOverlapCfg
    qkv_wgrad: Optional[TPOverlapCfg]
    fc1_dgrad: TPOverlapCfg
    fc1_wgrad: Optional[TPOverlapCfg]
    qkv_fprop: TPOverlapCfg
    proj_dgrad: TPOverlapCfg
    fc1_fprop: TPOverlapCfg
    fc2_dgrad: TPOverlapCfg
    proj_fprop: TPOverlapCfg
    fc2_fprop: TPOverlapCfg
# TODO: Add more configs and create a getter function to expose a single API.
#
# Hand-written TP overlap presets. Each variable name appears to encode, in
# order: precision, GPU, hidden size (h), tensor-parallel size (tp), optional
# context-parallel size (cp), micro-batch size (mbs) and sequence length.
#
# Model configs: H100/70B/TP8/MBS1/SeqLen8K
# NOTE(review): the variable names below say tp4 (H100) and tp2 (B200), not
# TP8 as the line above states; confirm which one is correct.
userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

userbuffers_bf16_b200_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

# llama3.1 405b
userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=8, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

userbuffers_bf16_b200_h16384_tp4_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=8, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

# llama3 70b LoRA
# The two wgrad entries are None here; CommOverlapConfig.setup drops None
# entries before handing the dict on.
userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_wgrad=None,
    fc1_dgrad=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
    fc1_wgrad=None,
    qkv_fprop=RingExchangeOverlapCfg(set_sm_margin=True),
    proj_dgrad=RingExchangeOverlapCfg(set_sm_margin=True),
    fc1_fprop=RingExchangeOverlapCfg(set_sm_margin=True),
    fc2_dgrad=RingExchangeOverlapCfg(set_sm_margin=True),
    proj_fprop=RingExchangeOverlapCfg(cga_size=2, set_sm_margin=True, fp8_buf=True),
    fc2_fprop=RingExchangeOverlapCfg(cga_size=2, set_sm_margin=True, fp8_buf=True),
)

# llama3.1 405b LoRA
userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    qkv_wgrad=None,
    fc1_dgrad=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
    fc1_wgrad=None,
    qkv_fprop=RingExchangeOverlapCfg(aggregate=True),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=True),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=True),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=True),
    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
)

# GPT3 20b
userbuffers_bf16_h100_h6144_tp2_mbs2_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_h100_h6144_tp2_mbs2_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
)

# GPT3 175b
userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

userbuffers_bf16_b200_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

# Nemotron 15B
userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)

# Nemotron 340B
userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
)

userbuffers_fp8_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
)
@dataclass
class _CommOverlapConfig:
    """Flat record of communication-overlap settings.

    Every field defaults to ``None``. ``CommOverlapConfig._override_user_cfgs``
    treats ``None`` as "not set by the user" and keeps the computed default.
    """

    # Tensor parallel communication overlap (experimental)
    tp_comm_overlap: Optional[bool] = None
    tp_comm_overlap_cfg: Optional[dict] = None
    tp_comm_bootstrap_backend: Optional[str] = None
    # Pipeline parallel communication overlap
    overlap_p2p_comm: Optional[bool] = None
    batch_p2p_comm: Optional[bool] = None
    # Data parallel communication overlap
    overlap_grad_reduce: Optional[bool] = None
    overlap_param_gather: Optional[bool] = None
    overlap_param_gather_with_optimizer_step: Optional[bool] = None
    align_param_gather: Optional[bool] = None
    bucket_size: Optional[int] = None
    # Pipeline bubble overlap
    defer_embedding_wgrad_compute: Optional[bool] = None
    wgrad_deferral_limit: Optional[int] = None
    # MOE expert parallel comm
    overlap_moe_expert_parallel_comm: Optional[bool] = None
    delay_wgrad_compute: Optional[bool] = None


@dataclass(kw_only=True)
class CommOverlapConfig:
    """Configuration for communication overlap optimizations in distributed training.

    This class manages tensor parallel, pipeline parallel, and data parallel
    communication overlap settings to improve training performance.

    Typical flow: construct, call ``finalize()`` once to snapshot the
    user-provided values, then call ``setup()`` to write the resolved settings
    onto the model, optimizer and DDP configs. ``data_parallel_size`` must be
    set before ``setup()`` is used with the distributed optimizer.
    """

    tp_comm_overlap: bool
    tp_comm_overlap_cfg: Optional[TransformerLayerTPOverlapCfg] = None
    tp_comm_bootstrap_backend: Optional[str] = "nccl"
    overlap_p2p_comm: Optional[bool] = None
    batch_p2p_comm: Optional[bool] = None
    overlap_grad_reduce: Optional[bool] = None
    overlap_param_gather: Optional[bool] = None
    overlap_param_gather_with_optimizer_step: Optional[bool] = None
    align_param_gather: Optional[bool] = None
    bucket_size: Optional[int] = None
    defer_embedding_wgrad_compute: Optional[bool] = None
    wgrad_deferral_limit: Optional[int] = None
    data_parallel_size: Optional[int] = None
    overlap_moe_expert_parallel_comm: Optional[bool] = None
    delay_wgrad_compute: Optional[bool] = None

    def finalize(self):
        """Snapshot the user-provided values into ``self.user_comm_overlap_cfg``.

        The snapshot is created only once; later calls return early so that
        adjustments made to it (for example disabling TP overlap) are kept.
        """
        # Don't recreate the user_comm_overlap_cfg if finalize is re-run
        if getattr(self, "user_comm_overlap_cfg", None) is not None:
            return

        self.user_comm_overlap_cfg = _CommOverlapConfig(
            tp_comm_overlap=self.tp_comm_overlap,
            tp_comm_overlap_cfg=self.tp_comm_overlap_cfg,
            tp_comm_bootstrap_backend=self.tp_comm_bootstrap_backend,
            overlap_p2p_comm=self.overlap_p2p_comm,
            batch_p2p_comm=self.batch_p2p_comm,
            overlap_grad_reduce=self.overlap_grad_reduce,
            overlap_param_gather=self.overlap_param_gather,
            overlap_param_gather_with_optimizer_step=self.overlap_param_gather_with_optimizer_step,
            align_param_gather=self.align_param_gather,
            bucket_size=self.bucket_size,
            defer_embedding_wgrad_compute=self.defer_embedding_wgrad_compute,
            wgrad_deferral_limit=self.wgrad_deferral_limit,
            overlap_moe_expert_parallel_comm=self.overlap_moe_expert_parallel_comm,
            delay_wgrad_compute=self.delay_wgrad_compute,
        )

    @staticmethod
    def _normalize_cuda_graph_scope(scope) -> list:
        """Return ``scope`` as a list of scope entries.

        ``None``, ``""`` and ``"full"`` yield an empty list, a string is split
        on commas, a list/tuple/set is copied, and any other single value
        (for example one ``CudaGraphScope`` member) is wrapped in a list.
        """
        if scope is None or (isinstance(scope, str) and scope in ("", "full")):
            return []
        if isinstance(scope, str):
            return [part.strip() for part in scope.split(",") if part.strip()]
        if isinstance(scope, (list, tuple, set, frozenset)):
            return list(scope)
        return [scope]

    def _validate_delay_wgrad_cuda_graph(self, model_cfg) -> None:
        """Assert the CUDA-graph requirements that apply with ``delay_wgrad_compute``.

        Raises:
            AssertionError: If the graphed scope includes the delayed wgrad and
                TE is older than 2.12.0, gradient accumulation fusion is off,
                or attention bias is enabled while the attention scope is graphed.
        """
        cuda_graph_scope = self._normalize_cuda_graph_scope(getattr(model_cfg, "cuda_graph_scope", None))

        def scope_enabled(member) -> bool:
            # Accept the enum member, its value, or its "CudaGraphScope.<value>" spelling.
            return (
                member in cuda_graph_scope
                or member.value in cuda_graph_scope
                or f"CudaGraphScope.{member.value}" in cuda_graph_scope
            )

        attn_scope_enabled = scope_enabled(CudaGraphScope.attn)
        moe_router_scope_enabled = scope_enabled(CudaGraphScope.moe_router)
        wgrad_in_graph_scope = attn_scope_enabled or (
            moe_router_scope_enabled
            and getattr(model_cfg, "moe_shared_expert_intermediate_size", None) is not None
            and not getattr(model_cfg, "moe_shared_expert_overlap", False)
        )
        if wgrad_in_graph_scope:
            assert is_te_min_version("2.12.0"), "CUDA graph with delay_wgrad_compute requires TE version >= 2.12.0."
            assert model_cfg.gradient_accumulation_fusion, (
                "CUDA graph with delay_wgrad_compute requires gradient_accumulation_fusion "
                "to be enabled. This is because default gradient accumulation does not use "
                "static memory addresses, which breaks CUDA graph requirements."
            )
            if attn_scope_enabled:
                assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, (
                    "CUDA graph with delay_wgrad_compute does not support attention bias for now."
                )

    def _get_model_comm_overlap_cfgs(
        self,
        model_cfg: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig,
        ddp_config: DistributedDataParallelConfig,
    ) -> _CommOverlapConfig:
        """Compute the model-side overlap settings and validate user requests.

        Requires ``finalize()`` to have been called, because it reads (and may
        update) ``self.user_comm_overlap_cfg``.

        Args:
            model_cfg: Model configuration providing the parallelism settings.
            ddp_config: DDP configuration; only ``overlap_grad_reduce`` is read.

        Returns:
            The default settings with any non-``None`` user values applied on top.

        Raises:
            AssertionError: If MoE expert-parallel overlap or delayed wgrad is
                requested with an unsupported configuration.
        """
        comm_overlap_cfg = _CommOverlapConfig()

        vp_size = model_cfg.virtual_pipeline_model_parallel_size
        if vp_size is None:
            vp_size = 1

        # Optimizations disabled by default, can be overridden by user
        comm_overlap_cfg.tp_comm_overlap = False
        comm_overlap_cfg.tp_comm_overlap_cfg = None
        comm_overlap_cfg.defer_embedding_wgrad_compute = False
        comm_overlap_cfg.wgrad_deferral_limit = -1
        comm_overlap_cfg.overlap_moe_expert_parallel_comm = False
        comm_overlap_cfg.delay_wgrad_compute = False

        # Check if TP overlap can be safely enabled; otherwise turn the user request off.
        if self.user_comm_overlap_cfg.tp_comm_overlap is True:
            if model_cfg.tensor_model_parallel_size < 2:
                logging.warning("Disabling tensor parallel communication overlap due to TP size < 2.")
                self.user_comm_overlap_cfg.tp_comm_overlap = False
            elif not model_cfg.sequence_parallel:
                logging.warning("Disabling tensor parallel communication overlap due to sequence_parallel=False.")
                self.user_comm_overlap_cfg.tp_comm_overlap = False
            elif not HAVE_TE:
                logging.warning("Disabling tensor parallel communication overlap due to TE not detected.")
                self.user_comm_overlap_cfg.tp_comm_overlap = False

        # PP overlap
        if model_cfg.pipeline_model_parallel_size > 1:
            if vp_size > 1:
                comm_overlap_cfg.overlap_p2p_comm = True
                comm_overlap_cfg.batch_p2p_comm = False
            else:
                comm_overlap_cfg.overlap_p2p_comm = False
                comm_overlap_cfg.batch_p2p_comm = True
        else:
            comm_overlap_cfg.overlap_p2p_comm = False
            comm_overlap_cfg.batch_p2p_comm = False

        # MOE expert parallel comm overlap
        assert hasattr(model_cfg, "overlap_moe_expert_parallel_comm"), (
            f"model_cfg: {model_cfg} does not have overlap_moe_expert_parallel_comm"
        )

        if self.user_comm_overlap_cfg.overlap_moe_expert_parallel_comm is True:
            assert model_cfg.expert_model_parallel_size > 1, (
                "overlap_moe_expert_parallel_comm is only supported when expert_model_parallel_size > 1"
            )
            assert model_cfg.num_moe_experts > 1, (
                "overlap_moe_expert_parallel_comm is only supported when num_moe_experts > 1, "
                f"but got {model_cfg.num_moe_experts}"
            )
            assert model_cfg.moe_token_dispatcher_type in ["alltoall", "flex"], (
                "overlap_moe_expert_parallel_comm is only supported when "
                "moe_token_dispatcher_type == 'alltoall' or 'flex', "
                f"but got {model_cfg.moe_token_dispatcher_type}"
            )
            assert model_cfg.bf16 or model_cfg.fp16, (
                "overlap_moe_expert_parallel_comm is only supported when using bf16 or fp16 models"
            )
            assert is_torch_min_version("2.6.0"), "A2A Overlap encounters hang issue with torch version < 2.6.0"
            if model_cfg.pipeline_model_parallel_size > 1:
                assert model_cfg.virtual_pipeline_model_parallel_size is not None, (
                    "If enabling EP A2A overlap, virtual_pipeline_model_parallel_size "
                    "must be specified when pipeline_model_parallel_size > 1"
                )
            assert model_cfg.recompute_granularity != "full", (
                "disable full recomputation when enabling overlap_moe_expert_parallel_comm"
            )
            assert model_cfg.recompute_method is None, (
                "disable recomputation method when enabling overlap_moe_expert_parallel_comm"
            )
            assert model_cfg.recompute_num_layers is None, (
                "recompute_num_layers must be None when enabling overlap_moe_expert_parallel_comm"
            )
            assert not model_cfg.moe_shared_expert_overlap, (
                "disable moe_shared_expert_overlap when enabling overlap_moe_expert_parallel_comm"
            )
            assert model_cfg.mtp_num_layers is None or model_cfg.mtp_num_layers == 1, (
                "MTP layernum only supports 1 when enabling overlap_moe_expert_parallel_comm."
            )

        if self.user_comm_overlap_cfg.delay_wgrad_compute is True:
            if ddp_config.overlap_grad_reduce or self.user_comm_overlap_cfg.overlap_grad_reduce:
                assert is_te_min_version("2.7.0"), (
                    "TE version >= 2.7.0 is required for overlap_grad_reduce when using "
                    f"delay_wgrad_compute. Current TE version: {get_te_version()}"
                )
            if model_cfg.gradient_accumulation_fusion is True:
                assert is_te_min_version("2.7.0"), (
                    "TE version >= 2.7.0 is required for gradient_accumulation_fusion when using "
                    f"delay_wgrad_compute. Current TE version: {get_te_version()}"
                )

            assert (
                model_cfg.overlap_moe_expert_parallel_comm
                or self.user_comm_overlap_cfg.overlap_moe_expert_parallel_comm
            ), "overlap_moe_expert_parallel_comm is required for delay_wgrad_compute"

            # CUDA graph scope-specific validations for delayed wgrad.
            self._validate_delay_wgrad_cuda_graph(model_cfg)

        comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg)
        return comm_overlap_cfg

    def _get_optimizer_overlap_cfgs(
        self, model_cfg: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig
    ) -> _CommOverlapConfig:
        """Compute the data-parallel overlap settings for the optimizer and DDP configs.

        Args:
            model_cfg: Model configuration providing the pipeline settings.

        Returns:
            The default settings with any non-``None`` user values applied on top.

        Raises:
            ValueError: If ``self.data_parallel_size`` has not been set.
        """
        if self.data_parallel_size is None:
            # Comparing None with an int below would fail with an opaque TypeError.
            raise ValueError(
                "CommOverlapConfig.data_parallel_size must be set before configuring "
                "data parallel communication overlap."
            )

        vp_size = model_cfg.virtual_pipeline_model_parallel_size
        if vp_size is None:
            vp_size = 1

        comm_overlap_cfg = _CommOverlapConfig()
        comm_overlap_cfg.bucket_size = None
        comm_overlap_cfg.overlap_grad_reduce = False
        comm_overlap_cfg.overlap_param_gather = False
        comm_overlap_cfg.overlap_param_gather_with_optimizer_step = False
        comm_overlap_cfg.align_param_gather = False

        if self.data_parallel_size > 1:
            comm_overlap_cfg.bucket_size = 128 * 1024 * 1024
            comm_overlap_cfg.overlap_grad_reduce = True
            comm_overlap_cfg.overlap_param_gather = True
            if model_cfg.pipeline_model_parallel_size > 1 and vp_size > 1:
                # overlap_param_gather_with_optimizer_step is left disabled here
                # due to an issue with checkpointing.
                comm_overlap_cfg.align_param_gather = True

        comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg)
        return comm_overlap_cfg

    def _apply_cfgs(self, src_cfg, dest_cfg):
        """Copy every field of ``src_cfg`` that ``dest_cfg`` also has onto ``dest_cfg``."""
        for field in fields(src_cfg):
            if hasattr(dest_cfg, field.name):
                setattr(dest_cfg, field.name, getattr(src_cfg, field.name))

    def _override_user_cfgs(self, comm_overlap_cfg):
        """Overwrite defaults in ``comm_overlap_cfg`` with non-``None`` user values.

        Returns:
            The same ``comm_overlap_cfg`` object, updated in place.
        """
        if isinstance(self.user_comm_overlap_cfg, _CommOverlapConfig):
            for field in fields(self.user_comm_overlap_cfg):
                user_value = getattr(self.user_comm_overlap_cfg, field.name)
                if user_value is not None:
                    setattr(comm_overlap_cfg, field.name, user_value)

        return comm_overlap_cfg

    def setup(
        self,
        model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig,
        optimizer_config: OptimizerConfig,
        ddp_config: DistributedDataParallelConfig,
    ) -> None:
        """Set up communication overlap configurations for the model, optimizer, and DDP.

        Args:
            model_config: Model configuration containing parallelism settings
            optimizer_config: Optimizer configuration for gradient overlap settings
            ddp_config: Distributed data parallel configuration
        """
        comm_overlap_cfg = self._get_model_comm_overlap_cfgs(model_config, ddp_config)
        self._apply_cfgs(comm_overlap_cfg, model_config)
        if model_config.tp_comm_overlap:
            if comm_overlap_cfg.tp_comm_overlap_cfg is None:
                logging.warning(
                    "Tensor parallel overlap: No overlap config provided. "
                    "Initializing TP comm overlap with the default config."
                )
                model_config.tp_comm_overlap_cfg = None
            else:
                # ub_cfgs is a dataclass, however TE needs a dict, so convert here
                ub_cfg_dict = asdict(comm_overlap_cfg.tp_comm_overlap_cfg)
                # remove keys with None values from dictionary to match TE's expectations
                model_config.tp_comm_overlap_cfg = {
                    key: value for key, value in ub_cfg_dict.items() if value is not None
                }
            model_config.tp_comm_bootstrap_backend = comm_overlap_cfg.tp_comm_bootstrap_backend

        # Data parallel overlap is only available with the Megatron DDP and Distributed optimizer
        if (
            isinstance(optimizer_config, OptimizerConfig)
            and isinstance(ddp_config, DistributedDataParallelConfig)
            and ddp_config.use_distributed_optimizer
        ):
            comm_overlap_cfg = self._get_optimizer_overlap_cfgs(model_config)
            self._apply_cfgs(comm_overlap_cfg, optimizer_config)
            self._apply_cfgs(comm_overlap_cfg, ddp_config)
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import datetime -import os -import time -import warnings -from typing import Callable, Optional - -import torch -import torch.distributed -import torch.nn.functional as F -from megatron.core import parallel_state, tensor_parallel -from megatron.core.datasets.utils import compile_helpers -from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train -from megatron.core.fusions.fused_bias_gelu import bias_gelu -from megatron.core.fusions.fused_bias_swiglu import bias_swiglu -from megatron.core.hyper_comm_grid import HyperCommGrid -from megatron.core.num_microbatches_calculator import ( - destroy_num_microbatches_calculator, - init_num_microbatches_calculator, -) -from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler -from megatron.core.utils import ( - configure_nvtx_profiling, - get_pg_rank, - get_te_version, - is_te_min_version, - is_torch_min_version, -) - -from megatron.bridge.models import GPTModelProvider, T5ModelProvider -from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig -from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig -from megatron.bridge.models.transformer_config import TransformerConfig -from megatron.bridge.training.config import ConfigContainer, DistributedInitConfig, RerunStateMachineConfig, RNGConfig -from megatron.bridge.utils.common_utils import ( - get_local_rank_preinit, - get_master_addr_safe, - get_master_port_safe, - get_rank_safe, - get_world_size_safe, -) - - -def initialize_megatron( - 
cfg: ConfigContainer, - allow_no_cuda: bool = False, - skip_mpu_initialization: bool = False, - get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, - get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, - restart_store: Optional[torch.distributed.Store] = None, -) -> Callable[[], None] | ProcessGroupCollection | None: - """Initialize Megatron core components and distributed setup. - - Sets up logging, initializes distributed environment (torch.distributed), - configures microbatch calculator, and sets random seeds. - - Args: - cfg: The main configuration container. - allow_no_cuda: If True, allows initialization without CUDA. - skip_mpu_initialization: If True, skips MPU initialization (for external managers). - get_embedding_ranks: Optional function to determine embedding layer ranks. - get_position_embedding_ranks: Optional function to determine position embedding ranks. - restart_store: Optional store for in-process restart. - - Returns: - An optional callable to finish MPU initialization if lazy_mpu_init is True, - otherwise None. - """ - - if not allow_no_cuda: - # Make sure cuda is available. - assert torch.cuda.is_available(), "Megatron requires CUDA." - - model_config = cfg.model - dist_config = cfg.dist - rng_config = cfg.rng - rerun_state_machine_config = cfg.rerun_state_machine - train_config = cfg.train - use_inprocess_restart = cfg.inprocess_restart is not None and cfg.inprocess_restart.enabled - - # Configure NVTX profiling if requested - if cfg.profiling is not None and cfg.profiling.nvtx_ranges: - configure_nvtx_profiling(enabled=True) - - # Prep for checkpoint conversion. 
- # if args.ckpt_convert_format is not None: - # assert args.ckpt_convert_save is not None - # assert args.load is not None - # args.exit_on_missing_checkpoint = True - - # TODO (maanug): determine if we want to support this behavior - # if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False): - # assert args.load is not None, "--use-checkpoint-args requires --load argument" - # load_args_from_checkpoint(args) - - init_num_microbatches_calculator( - get_rank_safe(), - train_config.rampup_batch_size, - train_config.global_batch_size, - train_config.micro_batch_size, - cfg.data_parallel_size, - train_config.decrease_batch_size_if_needed, - ) - - # init rerun global state - init_rerun_state(rerun_state_machine_config) - - # torch.distributed initialization - result = torch_dist_init( - model_config=model_config, - dist_config=dist_config, - rng_config=rng_config, - micro_batch_size=train_config.micro_batch_size, - num_distributed_optimizer_instances=cfg.ddp.num_distributed_optimizer_instances, - get_embedding_ranks=get_embedding_ranks, - get_position_embedding_ranks=get_position_embedding_ranks, - skip_mpu_initialization=skip_mpu_initialization, - restart_store=restart_store, - use_inprocess_restart=use_inprocess_restart, - ) - - # Compile dataset helpers after distributed initialization - # Use local rank to ensure each node compiles independently (multi-node without shared filesystem) - if torch.distributed.is_initialized(): - if get_local_rank_preinit() == 0: - start_time = time.time() - print("> compiling dataset index builder ...") - compile_helpers() - print( - ">>> done with dataset index builder. 
Compilation time: {:.3f} seconds".format( - time.time() - start_time - ), - flush=True, - ) - torch.distributed.barrier() - - return result - - -def torch_dist_init( - model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig, - dist_config: DistributedInitConfig, - rng_config: RNGConfig, - micro_batch_size: int, - num_distributed_optimizer_instances: int, - get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]], - get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]], - skip_mpu_initialization: bool, - restart_store: Optional[torch.distributed.Store] = None, - use_inprocess_restart: bool = False, -) -> Callable[[], None] | ProcessGroupCollection | None: - """Initialize torch.distributed and dependent components. - - Handles the core distributed setup, including process group initialization, - MPU (Model Parallel Unit) setup, random seed setting, and optional - compilation/warmup steps. - - Args: - model_config: Configuration for the specific model (GPTConfig or T5Config). - dist_config: Configuration for distributed initialization settings. - rng_config: Configuration for random number generation. - micro_batch_size: The micro batch size for JIT warmup. - num_distributed_optimizer_instances: Number of parallel optimizer instances. - get_embedding_ranks: Optional function to determine embedding layer ranks. - get_position_embedding_ranks: Optional function to determine position embedding ranks. - skip_mpu_initialization: If True, returns a function to finish MPU setup later. - - Returns: - An optional callable to finish MPU initialization if skip_mpu_initialization - or lazy_mpu_init is True, otherwise None. - """ - - def finish_mpu_init() -> ProcessGroupCollection: - # Pytorch distributed. 
- pg_collection = _initialize_distributed( - model_config=model_config.transformer - if isinstance(model_config, (GPTModelConfig, MambaModelConfig)) - else model_config, - dist_config=dist_config, - num_distributed_optimizer_instances=num_distributed_optimizer_instances, - get_embedding_ranks=get_embedding_ranks, - get_position_embedding_ranks=get_position_embedding_ranks, - restart_store=restart_store, - use_inprocess_restart=use_inprocess_restart, - ) - - # Random seeds for reproducibility. - if get_rank_safe() == 0: - print("> setting random seeds to {} ...".format(rng_config.seed)) - _set_random_seed( - rng_config.seed, - rng_config.data_parallel_random_init, - rng_config.te_rng_tracker, - rng_config.inference_rng_tracker, - use_cudagraphable_rng=(model_config.cuda_graph_impl != "none"), - pg_collection=pg_collection, - ) - - if model_config.num_moe_experts is not None: - MoEAuxLossAutoScaler.set_loss_scale(torch.ones(1, device=torch.cuda.current_device())) - return pg_collection - - if skip_mpu_initialization: - return None - - if dist_config.lazy_mpu_init: - # delayed initialization of DDP-related stuff - # We only set basic DDP globals - parallel_state.set_tensor_model_parallel_world_size(model_config.tensor_model_parallel_size) - # and return function for external DDP manager - # to call when it has DDP initialized - parallel_state.set_tensor_model_parallel_rank(get_rank_safe()) - return finish_mpu_init - # Megatron's MPU is the master. Complete initialization right away. - pg_collection = finish_mpu_init() - - if model_config.tp_comm_overlap: - _initialize_tp_communicators(model_config, micro_batch_size) - - return pg_collection - - -def init_rerun_state(rerun_state_machine_config: RerunStateMachineConfig) -> None: - """Initialize the rerun state machine for result validation or stats. - - Sets up state saving and restoration functions, particularly for RNG trackers. - - Args: - rerun_state_machine_config: Configuration for the rerun state machine. 
- """ - from megatron.core.rerun_state_machine import ( - RerunDiagnostic, - RerunErrorInjector, - RerunMode, - get_rerun_state_machine, - initialize_rerun_state_machine, - ) - - def state_save_func(): - return {"rng_tracker_states": tensor_parallel.get_cuda_rng_tracker().get_states()} - - def state_restore_func(state_dict): - if state_dict["rng_tracker_states"]: - tensor_parallel.get_cuda_rng_tracker().set_states(state_dict["rng_tracker_states"]) - - initialize_rerun_state_machine( - state_save_func=state_save_func, - state_restore_func=state_restore_func, - mode=RerunMode(rerun_state_machine_config.rerun_mode), - error_injector=RerunErrorInjector( - error_injection_rate=rerun_state_machine_config.error_injection_rate, - error_injection_type=RerunDiagnostic(rerun_state_machine_config.error_injection_type), - ), - ) - - # Store config on the singleton for use in loss validation - rsm = get_rerun_state_machine() - rsm.spiky_loss_factor = rerun_state_machine_config.spiky_loss_factor - - -def set_jit_fusion_options( - model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig, micro_batch_size: int -) -> None: - """Set PyTorch JIT layer fusion options and warmup JIT functions. - - Configures the JIT fuser (nvFuser or legacy) based on the PyTorch version - and warms up common fused kernels like bias_gelu and bias_dropout_add. - - Args: - model_config: Configuration for the specific model (GPTConfig or T5Config). - micro_batch_size: The micro batch size used for warmup tensor shapes. 
- """ - # flags required to enable jit fusion kernels - if is_torch_min_version("2.2.0a0"): - pass # we're using torch.compile for jit fusion - elif is_torch_min_version("1.10.0a0"): - # nvfuser - torch._C._jit_set_profiling_executor(True) - torch._C._jit_set_profiling_mode(True) - torch._C._jit_override_can_fuse_on_cpu(False) - torch._C._jit_override_can_fuse_on_gpu(False) - torch._C._jit_set_texpr_fuser_enabled(False) - torch._C._jit_set_nvfuser_enabled(True) - torch._C._debug_set_autodiff_subgraph_inlining(False) - else: - # legacy pytorch fuser - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - - _warmup_jit_function( - model_config.transformer if isinstance(model_config, (GPTModelConfig, MambaModelConfig)) else model_config, - micro_batch_size, - ) - - -def destroy_global_state() -> None: - """Destroy Megatron global states. - - Cleans up resources used by microbatch calculator, global memory buffer, - model parallel groups, and the rerun state machine. 
- """ - from megatron.core.rerun_state_machine import destroy_rerun_state_machine - - destroy_num_microbatches_calculator() - parallel_state.destroy_global_memory_buffer() - parallel_state.destroy_model_parallel() - destroy_rerun_state_machine() - - -def _initialize_tp_communicators( - model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig, micro_batch_size: int -) -> None: - """initializing the communicators with user buffers for high-performance tensor-model-parallel - communication overlap""" - - try: - import transformer_engine # noqa: F401 - import yaml - from transformer_engine.pytorch import module as te_module - - except ImportError: - raise RuntimeError( - "Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and 'transformer_engine' packages" - ) - - if model_config.tp_comm_overlap_cfg is not None: - if isinstance(model_config.tp_comm_overlap_cfg, str): - with open(model_config.tp_comm_overlap_cfg, "r") as stream: - ub_cfgs = yaml.safe_load(stream) - else: - ub_cfgs = model_config.tp_comm_overlap_cfg - else: - ub_cfgs = {} - - input_shape = [ - (model_config.seq_length * micro_batch_size) // model_config.context_parallel_size, - model_config.hidden_size, - ] - - if is_te_min_version("2.7.0"): - UserBufferQuantizationMode = te_module.base.UserBufferQuantizationMode - quantization_modes = [UserBufferQuantizationMode.FP8 if model_config.fp8 else UserBufferQuantizationMode.NONE] - if ( - model_config.fp8 is not None - and model_config.first_last_layers_bf16 - and (model_config.num_layers_at_start_in_bf16 > 0 or model_config.num_layers_at_end_in_bf16 > 0) - ): - quantization_modes.append(UserBufferQuantizationMode.NONE) - # The process group with the target bootstrap backend is created in Transformer Engine. 
- te_module.base.initialize_ub( - shape=input_shape, - tp_size=model_config.tensor_model_parallel_size, - quantization_modes=quantization_modes, - ub_cfgs=ub_cfgs, - bootstrap_backend=model_config.tp_comm_bootstrap_backend, - ) - elif is_te_min_version("1.9.0"): - # The process group with the target bootstrap backend is created in Transformer Engine. - te_module.base.initialize_ub( - shape=input_shape, - tp_size=model_config.tensor_model_parallel_size, - use_fp8=(model_config.fp8 is not None), - ub_cfgs=ub_cfgs, - bootstrap_backend=model_config.tp_comm_bootstrap_backend, - ) - else: - if model_config.tp_comm_bootstrap_backend != "mpi": - warnings.warn(f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend.") - # Create a MPI process group to help with TP communication overlap bootstrap. - torch.distributed.new_group(backend="mpi") - - te_module.base.initialize_ub( - shape=input_shape, - tp_size=model_config.tensor_model_parallel_size, - use_fp8=(model_config.fp8 is not None), - ub_cfgs=ub_cfgs, - ) - - -def _create_pg_collection( - model_config: TransformerConfig, - num_distributed_optimizer_instances: int, - get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, - get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, -) -> ProcessGroupCollection: - """Create all process groups via HyperCommGrid and return a ProcessGroupCollection.""" - hcp_sizes = getattr(model_config, "hierarchical_context_parallel_sizes", None) - if hcp_sizes is not None: - raise NotImplementedError( - "Decentralized process groups (use_decentralized_pg=True) do not support " - "hierarchical_context_parallel_sizes. Use cp_comm_type='a2a' or 'p2p' instead, " - "or set use_decentralized_pg=False to use the MPU path which supports 'a2a+p2p'." 
- ) - - world_size = torch.distributed.get_world_size() - tp_size = int(model_config.tensor_model_parallel_size) - pp_size = int(model_config.pipeline_model_parallel_size) - cp_size = int(model_config.context_parallel_size) if getattr(model_config, "context_parallel_size", 1) else 1 - model_size = tp_size * pp_size * cp_size - if world_size % model_size != 0: - raise RuntimeError(f"world_size ({world_size}) is not divisible by {model_size}") - dp_size = world_size // model_size - - grid = HyperCommGrid( - shape=[tp_size, cp_size, dp_size, pp_size], - dim_names=["tp", "cp", "dp", "pp"], - rank_offset=0, - backend="nccl", - ) - # Core groups - tp_pg = grid.create_pg(["tp"]) - cp_pg = grid.create_pg(["cp"]) - pp_pg = grid.create_pg(["pp"]) - dp_pg = grid.create_pg(["dp"]) - mp_pg = grid.create_pg(["tp", "pp"]) - tp_cp_pg = grid.create_pg(["tp", "cp"]) - tp_dp_cp_pg = grid.create_pg(["tp", "dp", "cp"]) - dp_cp_pg = grid.create_pg(["dp", "cp"]) - - # Expert/MoE related groups (refer to original parallel_state.initialize_model_parallel) - expert_tp_size = ( - int(model_config.expert_tensor_parallel_size) - if getattr(model_config, "expert_tensor_parallel_size", None) - else tp_size - ) - ep_size = ( - int(model_config.expert_model_parallel_size) if getattr(model_config, "expert_model_parallel_size", 1) else 1 - ) - # Expert data-parallel size folds CP into DP (as in original expert rank generator) - expt_model_block = expert_tp_size * ep_size * pp_size - if world_size % expt_model_block != 0: - raise RuntimeError( - f"world_size ({world_size}) is not divisible by expert_tensor_model_pipeline size ({expt_model_block})" - ) - expt_dp_size = world_size // expt_model_block - use_optimizer_instance_groups = num_distributed_optimizer_instances > 1 - inner_dp_dim: Optional[str] = None - outer_dp_dim: Optional[str] = None - if use_optimizer_instance_groups: - assert expt_dp_size % num_distributed_optimizer_instances == 0, ( - "Expert DP size must be divisible by the number of 
optimizer instances." - ) - inner_expt_dp_size = expt_dp_size // num_distributed_optimizer_instances - expert_grid = HyperCommGrid( - shape=[expert_tp_size, ep_size, inner_expt_dp_size, num_distributed_optimizer_instances, pp_size], - dim_names=["tp", "ep", "inner_dp", "outer_dp", "pp"], - rank_offset=0, - backend="nccl", - ) - dp_group_dims: list[str] = ["inner_dp", "outer_dp"] - inner_dp_dim = "inner_dp" - outer_dp_dim = "outer_dp" - else: - expert_grid = HyperCommGrid( - shape=[expert_tp_size, ep_size, expt_dp_size, pp_size], - dim_names=["tp", "ep", "dp", "pp"], - rank_offset=0, - backend="nccl", - ) - dp_group_dims = ["dp"] - ep_pg = expert_grid.create_pg(["ep"]) - expt_tp_pg = expert_grid.create_pg(["tp"]) - tp_ep_pg = expert_grid.create_pg(["tp", "ep"]) - tp_ep_pp_pg = expert_grid.create_pg(["tp", "ep", "pp"]) - expt_dp_pg = expert_grid.create_pg(dp_group_dims) - - # Embedding and position-embedding groups - embd_pg = None - pos_embd_pg = None - # Enumerate ranks per PP group - pp_rank_lists = grid._gen_rank_enum(["pp"]) - # Determine embedding ranks for each pp group - embedding_rank_lists: list[list[int]] = [] - pos_embedding_rank_lists: list[list[int]] = [] - for ranks in pp_rank_lists: - if not ranks: - continue - if get_embedding_ranks is not None: - # Use custom callback to determine embedding ranks - embedding_rank_lists.append(get_embedding_ranks(ranks, pp_size)) - else: - # Default: embedding_ranks are first and last pp stage (or only one if pp_size==1) - embedding_rank_lists.append([ranks[0]] if len(ranks) == 1 else [ranks[0], ranks[-1]]) - if get_position_embedding_ranks is not None: - # Use custom callback to determine position embedding ranks - pos_embedding_rank_lists.append(get_position_embedding_ranks(ranks, pp_size)) - else: - # Default: position embedding ranks are first pp stage only - pos_embedding_rank_lists.append([ranks[0]]) - if embedding_rank_lists: - embd_pg, _ = torch.distributed.new_subgroups_by_enumeration(embedding_rank_lists, 
backend="nccl") - if pos_embedding_rank_lists: - pos_embd_pg, _ = torch.distributed.new_subgroups_by_enumeration(pos_embedding_rank_lists, backend="nccl") - - # Build Partial-Distributed-Optimizer groups for Expert DP when multiple instances are used. - intra_expt_dp_pg = None - inter_dist_opt_pg = None - intra_dist_opt_pg = None - if inner_dp_dim is not None and outer_dp_dim is not None: - intra_expt_dp_pg = expert_grid.create_pg([inner_dp_dim]) - inter_dist_opt_pg = expert_grid.create_pg([outer_dp_dim]) - # Match distributed optimizer instance grouping from parallel_state: - # combine tp-ep-pp ranks across the intra-partial DP slice. - intra_dist_opt_pg = expert_grid.create_pg(["tp", "ep", inner_dp_dim, "pp"]) - - # Build ProcessGroupCollection with available groups. - pg_collection = ProcessGroupCollection( - tp=tp_pg, - pp=pp_pg, - mp=mp_pg, - embd=embd_pg, - pos_embd=pos_embd_pg, - cp=cp_pg, - tp_cp=tp_cp_pg, - hcp=None, - ep=ep_pg, - expt_tp=expt_tp_pg, - tp_ep=tp_ep_pg, - tp_ep_pp=tp_ep_pp_pg, - tp_dp_cp=tp_dp_cp_pg, - dp=dp_pg, - dp_cp=dp_cp_pg, - expt_dp=expt_dp_pg, - intra_dp_cp=dp_cp_pg, - intra_expt_dp=intra_expt_dp_pg if intra_expt_dp_pg is not None else expt_dp_pg, - inter_dist_opt=inter_dist_opt_pg, - intra_dist_opt=intra_dist_opt_pg, - ) - return pg_collection - - -def _setup_flight_recorder_env(dist_config: DistributedInitConfig) -> None: - """Set flight recorder env vars based on config or pre-existing environment. - - Priority: pre-existing env var > config value. If no dump path is provided - (either via config or env), no env vars are set. 
- """ - _fr_path = ( - os.environ.get("TORCH_FR_DUMP_TEMP_FILE") - or os.environ.get("TORCH_NCCL_DEBUG_INFO_TEMP_FILE") - or dist_config.flight_recorder_dump_path - ) - if _fr_path is None: - return - - _fr_env_defaults = { - "TORCH_FR_DUMP_TEMP_FILE": _fr_path, - "TORCH_NCCL_DEBUG_INFO_TEMP_FILE": _fr_path, - "TORCH_NCCL_TRACE_BUFFER_SIZE": str(dist_config.flight_recorder_trace_buffer_size), - "TORCH_NCCL_DUMP_ON_TIMEOUT": str(int(dist_config.flight_recorder_dump_on_timeout)), - "TORCH_INCLUDE_STACK_TRACE": str(int(dist_config.flight_recorder_include_stack_trace)), - "TORCH_INCLUDE_ONLY_ACTIVE": str(int(dist_config.flight_recorder_include_only_active)), - "TORCH_NCCL_EXTRA_DUMP_ON_EXEC": str(int(dist_config.flight_recorder_extra_dump_on_exec)), - } - for _var, _default in _fr_env_defaults.items(): - if _var in os.environ: - warnings.warn( - f"Flight recorder: env var {_var} is already set to " - f"'{os.environ[_var]}'; ignoring config value '{_default}'.", - stacklevel=2, - ) - else: - os.environ[_var] = _default - if get_rank_safe() == 0: - print( - "Flight recorder env vars:\n" + "\n".join(f" {k}={os.environ[k]}" for k in _fr_env_defaults), - flush=True, - ) - - -def _initialize_distributed( - model_config: TransformerConfig, - dist_config: DistributedInitConfig, - num_distributed_optimizer_instances: int, - get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]], - get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]], - restart_store: Optional[torch.distributed.Store] = None, - use_inprocess_restart: bool = False, -) -> ProcessGroupCollection: - """Initialize torch.distributed and core model parallel.""" - - device_count = torch.cuda.device_count() - if torch.distributed.is_initialized(): - if get_rank_safe() == 0: - print( - "torch distributed is already initialized, skipping initialization ...", - flush=True, - ) - - else: - if get_rank_safe() == 0: - print("> initializing torch distributed ...", 
flush=True) - - # Manually set the device ids. - if device_count > 0: - if dist_config.external_gpu_device_mapping: - torch.cuda.set_device(0) - else: - torch.cuda.set_device(get_local_rank_preinit()) - - # Set to non-default stream for cudagraph capturing. - if model_config.cuda_graph_impl == "transformer_engine": - torch.cuda.set_stream(torch.cuda.Stream()) - - # Ensure MASTER_ADDR and MASTER_PORT are set for distributed initialization - # These may come from torchrun, SLURM, or defaults - if "MASTER_ADDR" not in os.environ: - os.environ["MASTER_ADDR"] = get_master_addr_safe() - if "MASTER_PORT" not in os.environ: - os.environ["MASTER_PORT"] = str(get_master_port_safe()) - - _setup_flight_recorder_env(dist_config) - - # Call the init process - init_process_group_kwargs = { - "backend": dist_config.distributed_backend, - "world_size": get_world_size_safe(), - "rank": get_rank_safe(), - "store": restart_store, - "timeout": datetime.timedelta(minutes=dist_config.distributed_timeout_minutes), - } - - torch.distributed.init_process_group(**init_process_group_kwargs) - - # Force NCCL backend initialization if using in-process restart - if use_inprocess_restart: - force_nccl_backend_init(torch.cuda.current_device()) - - if dist_config.external_gpu_device_mapping: - torch.distributed.barrier(device_ids=[0]) - else: - torch.distributed.barrier(device_ids=[get_local_rank_preinit()]) - - # Set the tensor model-parallel, pipeline model-parallel, and - # data-parallel communicators. - - if device_count == 0: - if dist_config.use_decentralized_pg or dist_config.distributed_backend == "nccl": - raise RuntimeError("Cannot initialize parallel groups with no CUDA devices available (device_count=0)") - - if dist_config.use_decentralized_pg: - # Use HyperCommGrid to create local parallel groups passed through functions - # instead of relying on mcore's global parallel state (mpu) variables. 
- parallel_state._set_global_memory_buffer() - pg_collection = _create_pg_collection( - model_config, - num_distributed_optimizer_instances, - get_embedding_ranks=get_embedding_ranks, - get_position_embedding_ranks=get_position_embedding_ranks, - ) - if get_rank_safe() == 0: - tp = int(model_config.tensor_model_parallel_size) - pp = int(model_config.pipeline_model_parallel_size) - cp = int(model_config.context_parallel_size) if getattr(model_config, "context_parallel_size", 1) else 1 - dp = torch.distributed.get_world_size() // (tp * pp * cp) - print(f"> initialized HyperCommGrid with tp={tp}, pp={pp}, cp={cp}, dp={dp}") - return pg_collection - else: - # Use the original mcore parallel_state.initialize_model_parallel approach - if parallel_state.model_parallel_is_initialized(): - print("model parallel is already initialized") - else: - parallel_state.initialize_model_parallel( - tensor_model_parallel_size=model_config.tensor_model_parallel_size, - pipeline_model_parallel_size=model_config.pipeline_model_parallel_size, - virtual_pipeline_model_parallel_size=model_config.virtual_pipeline_model_parallel_size, - pipeline_model_parallel_comm_backend=model_config.pipeline_model_parallel_comm_backend, - context_parallel_size=model_config.context_parallel_size, - hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes, - hybrid_context_parallel=model_config.hybrid_context_parallel, - expert_model_parallel_size=model_config.expert_model_parallel_size, - num_distributed_optimizer_instances=num_distributed_optimizer_instances, - expert_tensor_parallel_size=model_config.expert_tensor_parallel_size, - distributed_timeout_minutes=dist_config.distributed_timeout_minutes, - nccl_communicator_config_path=dist_config.nccl_communicator_config_path, - order="tp-cp-ep-dp-pp" if not dist_config.use_tp_pp_dp_mapping else "tp-cp-ep-pp-dp", - get_embedding_ranks=get_embedding_ranks, - get_position_embedding_ranks=get_position_embedding_ranks, - 
create_gloo_process_groups=dist_config.use_gloo_process_groups, - use_sharp=dist_config.use_sharp, - high_priority_stream_groups=dist_config.high_priority_stream_groups, - sharp_enabled_group=dist_config.sharp_enabled_group, - ) - if get_rank_safe() == 0: - print( - f"> initialized tensor model parallel with size " - f"{parallel_state.get_tensor_model_parallel_world_size()}" - ) - print( - f"> initialized pipeline model parallel with size " - f"{parallel_state.get_pipeline_model_parallel_world_size()}" - ) - # Return a ProcessGroupCollection using mpu process groups - return ProcessGroupCollection.use_mpu_process_groups() - - -def _set_random_seed( - seed_: int, - data_parallel_random_init: bool = False, - te_rng_tracker: bool = False, - inference_rng_tracker: bool = False, - use_cudagraphable_rng: bool = False, - *, - pg_collection: ProcessGroupCollection, -) -> None: - """Set random seed for reproducability.""" - assert seed_ is not None and seed_ > 0, f"Seed ({seed_}) should be a positive integer." - - import random - - import numpy as np - - current_rank = torch.distributed.get_rank() - # Ensure that different pipeline MP stages get different seeds. 
- pp_rank = torch.distributed.get_group_rank(pg_collection.pp, current_rank) - seed = seed_ + (100 * pp_rank) - # Ensure different data parallel ranks get different seeds - if data_parallel_random_init: - dp_rank = torch.distributed.get_group_rank(pg_collection.dp, current_rank) - seed = seed + (10 * dp_rank) - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - if torch.cuda.device_count() > 0: - # Derive TP/EP/ETP ranks from provided process groups using helper utils - tp_rank = get_pg_rank(pg_collection.tp) - ep_rank = get_pg_rank(pg_collection.ep) - etp_rank = get_pg_rank(pg_collection.expt_tp) - - tensor_parallel.model_parallel_cuda_manual_seed( - seed, - te_rng_tracker, - inference_rng_tracker, - use_cudagraphable_rng, - tp_rank=tp_rank, - ep_rank=ep_rank, - etp_rank=etp_rank, - ) - - -def _warmup_jit_function(model_config: TransformerConfig, micro_batch_size: int) -> None: - """Compilie JIT functions before the main training steps""" - if model_config.bf16: - dtype = torch.bfloat16 - elif model_config.fp16: - dtype = torch.float16 - else: - dtype = torch.float32 - # Warmup fused bias+gelu - bias = torch.rand( - model_config.ffn_hidden_size // model_config.tensor_model_parallel_size, - dtype=dtype, - device="cuda", - ) - input = torch.rand( - ( - model_config.seq_length // model_config.context_parallel_size, - micro_batch_size, - model_config.ffn_hidden_size // model_config.tensor_model_parallel_size, - ), - dtype=dtype, - device="cuda", - ) - # Warmup JIT fusions with the input grad_enable state of both forward - # prop and recomputation - for bias_grad, input_grad in zip([True, True], [False, True]): - bias.requires_grad, input.requires_grad = bias_grad, input_grad - for _ in range(5): - if model_config.activation_func == F.silu: - output = bias_swiglu(input, bias) - else: - output = bias_gelu(bias, input) - del bias, input, output - - # Warmup fused bias+dropout+add - if model_config.sequence_parallel: - tp_world_size = 
int(model_config.tensor_model_parallel_size) - seq_length = model_config.seq_length // tp_world_size - else: - seq_length = model_config.seq_length - input = torch.rand( - ( - seq_length // model_config.context_parallel_size, - micro_batch_size, - model_config.hidden_size, - ), - dtype=dtype, - device="cuda", - ) - residual = torch.rand( - ( - seq_length // model_config.context_parallel_size, - micro_batch_size, - model_config.hidden_size, - ), - dtype=dtype, - device="cuda", - ) - bias = torch.rand((model_config.hidden_size), dtype=dtype, device="cuda").expand_as(residual) - dropout_rate = 0.1 - # Warmup JIT fusions with the input grad_enable state of both forward - # prop and recomputation - for input_grad, bias_grad, residual_grad in zip([False, True], [True, True], [True, True]): - input.requires_grad = input_grad - bias.requires_grad = bias_grad - residual.requires_grad = residual_grad - for _ in range(5): - output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate) - del bias, input, residual, output - torch.cuda.empty_cache() - - -def force_nccl_backend_init(device_id: torch.device) -> None: - """Force NCCL backend initialization for in-process restart compatibility. - - The nvidia-resiliency-ext in-process restart uses destroy_process_group to - terminate the NCCL backend, which does not terminate NCCL kernels if the NCCL - backend wasn't fully initialized before additional distributed subgroups are created. - - This function forces full initialization of the NCCL backend by performing - a simple all_reduce operation. 
- - Args: - device_id: CUDA device ID to use for the dummy tensor operation - """ - tensor = torch.ones(128, device=device_id) - torch.distributed.all_reduce(tensor) - torch.cuda.synchronize() - -``` - diff --git a/skills/nemotron-customize/context/mbridge-pretrain.txt b/skills/nemotron-customize/context/mbridge-pretrain.txt deleted file mode 100644 index 43d487b2e..000000000 --- a/skills/nemotron-customize/context/mbridge-pretrain.txt +++ /dev/null @@ -1,13770 +0,0 @@ - -/Users/mromeijn/src/Megatron-Bridge -├── docs -│ ├── models -│ │ ├── llm -│ │ │ ├── README.md * -│ │ │ ├── index.md * -│ │ │ ├── nemotron3-super.md * -│ │ │ ├── nemotron3.md * -│ │ │ └── nemotronh.md * -│ │ ├── vlm -│ │ └── README.md * -│ ├── training -│ │ ├── images -│ │ │ ├── activation-recomputation-example-1.jpg * -│ │ │ ├── activation-recomputation-example-2.jpg * -│ │ │ ├── canonical_lora.png * -│ │ │ ├── performant_lora.png * -│ │ │ ├── pp_comm_overlap.png * -│ │ │ └── tp_comm_overlap.png * -│ │ ├── README.md * -│ │ ├── activation-recomputation.md * -│ │ ├── attention-optimizations.md * -│ │ ├── callbacks.md * -│ │ ├── checkpointing.md * -│ │ ├── communication-overlap.md * -│ │ ├── config-container-overview.md * -│ │ ├── cpu-offloading.md * -│ │ ├── cuda-graphs.md * -│ │ ├── distillation.md * -│ │ ├── entry-points.md * -│ │ ├── hybrid-context-parallel.md * -│ │ ├── logging.md * -│ │ ├── megatron-fsdp.md * -│ │ ├── mixed-precision.md * -│ │ ├── multi-token-prediction.md * -│ │ ├── optimizer-scheduler.md * -│ │ ├── packed-sequences.md * -│ │ ├── peft.md * -│ │ ├── profiling.md * -│ │ ├── pruning.md * -│ │ ├── resiliency.md * -│ │ └── training-loop-settings.md * -│ ├── images -│ ├── modelopt -│ ├── releases -│ ├── README.md * -│ ├── index.md * -│ ├── parallelisms.md * -│ ├── performance-guide.md * -│ ├── performance-summary.md * -│ └── recipe-usage.md * -├── examples -│ ├── models -│ │ ├── gpt_oss -│ │ │ ├── README.md * -│ │ │ └── slurm_pretrain.sh * -│ │ ├── nemotron_3 -│ │ │ ├── nano -│ │ │ 
│ ├── pretrain_nemotron_3_nano.py * + -│ │ │ │ └── slurm_pretrain.sh * -│ │ │ ├── super -│ │ │ │ ├── pretrain_nemotron_3_super.py * + -│ │ │ │ └── slurm_pretrain.sh * -│ │ │ └── README.md * -│ │ ├── audio_lm -│ │ │ ├── qwen2_audio -│ │ │ └── qwen3_asr -│ │ ├── bailing -│ │ ├── minimax_m2 -│ │ ├── qwen3_next -│ │ │ └── conf -│ │ ├── sarvam -│ │ └── vlm -│ │ ├── gemma3_vl -│ │ ├── glm_45v -│ │ ├── kimi_k25_vl -│ │ ├── ministral3 -│ │ ├── nemotron_vl -│ │ │ └── ... -│ │ ├── qwen25_omni -│ │ ├── qwen35_vl -│ │ ├── qwen3_vl -│ │ └── qwen_vl -│ │ └── ... -│ ├── conversion -│ │ ├── adapter -│ │ └── compare_hf_and_megatron -│ ├── decentralized_pg -│ ├── diffusion -│ │ └── recipes -│ │ ├── flux -│ │ │ └── ... -│ │ └── wan -│ │ └── ... -│ ├── distillation -│ │ └── llama -│ │ └── conf -│ ├── evaluation -│ │ └── utils -│ ├── inference -│ │ └── vlm -│ ├── long_context -│ ├── peft -│ ├── quantization -│ │ └── conf -│ ├── resiliency -│ │ ├── fault_tolerance -│ │ └── straggler_detection -│ └── rl -├── scripts -│ ├── training -│ │ ├── README.md * -│ │ ├── launch_with_nemo_run.py * + -│ │ ├── launch_with_sbatch.sh * -│ │ └── run_recipe.py * + -│ └── performance -│ ├── configs -│ │ ├── deepseek -│ │ ├── gpt_oss -│ │ ├── kimi -│ │ ├── llama -│ │ ├── nemotronh -│ │ ├── qwen -│ │ └── qwen_vl -│ └── utils -├── src -│ └── megatron -│ └── bridge -│ ├── recipes -│ │ ├── nemotronh -│ │ │ ├── __init__.py * + -│ │ │ ├── nemotron_3_nano.py * + -│ │ │ └── nemotron_3_super.py * + -│ │ ├── utils -│ │ │ └── dataset_utils.py * + -│ │ ├── __init__.py * + -│ │ ├── common.py * + -│ │ ├── ... -│ ├── training -│ │ ├── utils -│ │ │ └── omegaconf_utils.py * + -│ │ ├── gpt_step.py * + -│ │ ├── pretrain.py * + -│ │ ├── setup.py * + -│ │ ├── ... -│ ├── data -│ │ └── ... -│ ├── diffusion -│ │ └── ... -│ ├── inference -│ │ └── ... -│ ├── models -│ │ └── ... 
-│ ├── peft -│ └── utils -├── .github -│ ├── ISSUE_TEMPLATE -│ ├── actions -│ │ └── test-template -│ └── workflows -│ └── config -├── .specstory -├── 3rdparty -│ └── Megatron-LM -│ ├── .github -│ │ ├── ISSUE_TEMPLATE -│ │ ├── actions -│ │ │ └── ... -│ │ ├── scripts -│ │ └── workflows -│ │ └── ... -│ ├── .gitlab -│ │ ├── scripts -│ │ └── stages -│ ├── docker -│ │ ├── common -│ │ └── patches -│ ├── docs -│ │ ├── advanced -│ │ ├── api-guide -│ │ │ └── ... -│ │ ├── developer -│ │ ├── discussions -│ │ │ └── ... -│ │ ├── get-started -│ │ ├── images -│ │ │ └── ... -│ │ ├── models -│ │ └── user-guide -│ │ └── ... -│ ├── examples -│ │ ├── academic_paper_scripts -│ │ │ └── ... -│ │ ├── bert -│ │ ├── export -│ │ │ └── ... -│ │ ├── gpt3 -│ │ ├── inference -│ │ │ └── ... -│ │ ├── llama -│ │ ├── mamba -│ │ ├── mimo -│ │ │ └── ... -│ │ ├── mixtral -│ │ ├── multimodal -│ │ │ └── ... -│ │ ├── post_training -│ │ │ └── ... -│ │ ├── rl -│ │ │ └── ... -│ │ └── t5 -│ ├── images -│ ├── megatron -│ │ ├── core -│ │ │ └── ... -│ │ ├── inference -│ │ ├── legacy -│ │ │ └── ... -│ │ ├── post_training -│ │ ├── rl -│ │ │ └── ... -│ │ └── training -│ │ └── ... -│ ├── scripts -│ ├── tasks -│ ├── tests -│ │ ├── functional_tests -│ │ │ └── ... -│ │ ├── test_utils -│ │ │ └── ... -│ │ └── unit_tests -│ │ └── ... 
-│ └── tools -│ ├── bert_embedding -│ └── checkpoint -├── docker -│ ├── common -│ └── patches -├── skills -│ ├── adding-model-support -│ ├── code-style -│ ├── developer-guide -│ ├── mlm-bridge-training -│ ├── multi-node-slurm -│ ├── parity-testing -│ ├── perf-techniques -│ │ ├── cuda-graphs -│ │ ├── expert-parallel-overlap -│ │ ├── hybrid-context-parallel -│ │ ├── megatron-fsdp -│ │ ├── moe-comm-overlap -│ │ ├── packed-sequences-long-context -│ │ ├── parallelism-strategies -│ │ ├── sequence-packing -│ │ └── tp-dp-comm-overlap -│ └── resiliency -├── tests -│ ├── functional_tests -│ │ ├── data -│ │ │ ├── energon -│ │ │ └── hf_processors -│ │ ├── diffusion -│ │ │ ├── flux -│ │ │ └── wan -│ │ ├── inference -│ │ ├── launch_scripts -│ │ │ ├── active -│ │ │ └── flaky -│ │ ├── models -│ │ │ ├── qwen3_asr -│ │ │ └── qwen_audio -│ │ └── test_groups -│ │ ├── ckpts -│ │ │ └── ... -│ │ ├── converter -│ │ ├── data -│ │ │ └── ... -│ │ ├── diffusion -│ │ │ └── ... -│ │ ├── models -│ │ │ └── ... -│ │ ├── quantization -│ │ │ └── ... -│ │ ├── recipes -│ │ ├── training -│ │ └── utils -│ └── unit_tests -│ ├── data -│ │ ├── builders -│ │ ├── datasets -│ │ ├── energon -│ │ ├── mimo -│ │ └── vlm_datasets -│ ├── diffusion -│ │ ├── data -│ │ │ └── ... -│ │ ├── model -│ │ │ └── ... -│ │ └── recipes -│ │ └── ... -│ ├── inference -│ │ └── vlm -│ ├── models -│ │ ├── common -│ │ ├── decorators -│ │ ├── deepseek -│ │ ├── gemma -│ │ ├── gemma_vl -│ │ ├── glm -│ │ ├── glm_vl -│ │ ├── gpt -│ │ ├── gpt_oss -│ │ ├── hf_pretrained -│ │ ├── kimi -│ │ ├── kimi_vl -│ │ ├── llama -│ │ ├── llama_nemotron -│ │ ├── mamba -│ │ ├── mimo -│ │ ├── minimax_m2 -│ │ ├── ministral3 -│ │ ├── mistral -│ │ ├── nemotron -│ │ ├── nemotron_vl -│ │ ├── nemotronh -│ │ ├── olmoe -│ │ ├── qwen -│ │ ├── qwen3_asr -│ │ │ └── ... -│ │ ├── qwen_audio -│ │ ├── qwen_omni -│ │ │ └── ... -│ │ ├── qwen_vl -│ │ │ └── ... 
-│ │ └── sarvam -│ ├── peft -│ ├── recipes -│ │ ├── gemma -│ │ ├── gpt -│ │ ├── kimi -│ │ ├── nemotronh -│ │ ├── qwen -│ │ ├── qwen_vl -│ │ │ └── ... -│ │ └── utils -│ ├── scripts -│ │ └── performance -│ ├── training -│ │ ├── mimo -│ │ ├── mlm_compat -│ │ ├── post_training -│ │ └── utils -│ └── utils -└── tutorials - ├── data - │ └── dclm - ├── recipes - │ └── llama - │ └── conf - └── training - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; depth cap 3; selected files shown. - - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/setup.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -import logging -import time -from functools import partial -from typing import Any, Callable, NamedTuple, Optional - -from megatron.bridge.models.common import ModelBuilder, ModelConfig -from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig -from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig -from megatron.bridge.models.model_provider import ModelProviderMixin -from megatron.bridge.models.transformer_config import TransformerConfig -import torch -from megatron.core.config import set_experimental_flag -from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig, finalize_model_grads -from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel as megatron_FSDP -from megatron.core.jit import disable_jit_fuser -from megatron.core.optimizer import MegatronOptimizer -from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler -from megatron.core.process_groups_config import ProcessGroupCollection -from megatron.core.rerun_state_machine import RerunDataIterator -from megatron.core.transformer import MegatronModule - -from megatron.bridge.data.loaders import setup_data_iterators -from megatron.bridge.training.callbacks import CallbackContext, CallbackManager, should_fire -from megatron.bridge.models import GPTModelProvider, T5ModelProvider -from megatron.bridge.training import fault_tolerance -from megatron.bridge.training.checkpointing import ( - _load_checkpoint_from_path, - checkpoint_exists, - CheckpointLoadContext, - CheckpointManager, - create_checkpoint_manager, -) -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.initialize import initialize_megatron, set_jit_fusion_options -from megatron.bridge.training.optim import setup_optimizer -from megatron.bridge.training.state import GlobalState -from megatron.bridge.training.tensor_inspect import ( - finalize_tensor_inspect_post_model_initialization, - 
initialize_tensor_inspect_pre_model_initialization, -) -from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer -from megatron.bridge.training.utils.log_utils import append_to_progress_log, barrier_and_log, setup_logging -from megatron.bridge.utils.common_utils import get_rank_safe, print_rank_0 - -class SetupOutput(NamedTuple): - """Represents the output of the main setup function. - - Contains all the initialized components necessary for training or evaluation. - - Attributes: - state: The global state object holding configuration and runtime information. - model: The initialized Megatron model. - optimizer: The initialized optimizer. - scheduler: The initialized learning rate scheduler. - train_data_iterator: The data iterator for the training dataset, if applicable. - valid_data_iterator: The data iterator for the validation dataset, if applicable. - test_data_iterator: The data iterator for the testing dataset, if applicable. - checkpoint_manager: The checkpoint manager for save/load operations. - pg_collection: The process group collection initialized for this run. 
- """ - - state: GlobalState - model: MegatronModule - optimizer: MegatronOptimizer - scheduler: OptimizerParamScheduler - train_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]] - valid_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]] - test_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]] - checkpoint_manager: CheckpointManager - pg_collection: ProcessGroupCollection - - -def setup( - state: GlobalState, - train_valid_test_datasets_provider: Callable[..., tuple[Optional[Any], Optional[Any], Optional[Any]]], - get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, - get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None, - restart_store: Optional[torch.distributed.Store] = None, - callback_manager: CallbackManager | None = None, -) -> SetupOutput: - """Initialize the training/evaluation environment using an existing GlobalState. - - Performs all runtime setup using the provided `state` and its attached config (`state.cfg`). - This includes: - - enabling Megatron-Core experimental features - - initializing async checkpoint workers (if enabled) - - logging setup - - torch.distributed and model-parallel initialization (via initialize_megatron) - - tokenizer/model/optimizer/scheduler construction - - optional checkpoint load - - dataloader setup - - Args: - state: The GlobalState instance to populate and use throughout setup. - train_valid_test_datasets_provider: Callable returning the train/valid/test datasets or iterators. - get_embedding_ranks: Optional function to determine embedding layer ranks for model-parallel init. - get_position_embedding_ranks: Optional function to determine positional embedding ranks. - restart_store: Optional torch.distributed Store used when in-process restart is enabled. 
- callback_manager: Optional CallbackManager whose on_data_init_start hook is fired - after the model/optimizer/checkpoint are ready but before any dataset files are - opened. Use this for JIT warmup with mock data and MLPerf init_stop/run_start - logging to ensure no real dataset I/O occurs before run_start is recorded. - - Returns: - SetupOutput containing the populated state, model, optimizer, scheduler, dataloaders, and ckpt context. - """ - cfg = state.cfg - maybe_log_and_save_config(cfg) - - # Conditionally enable experimental features for Megatron Core - set_experimental_flag(cfg.dist.enable_megatron_core_experimental) - - # Disable the JIT fuser if requested - if cfg.dist.disable_jit_fuser: - print_rank_0("Disabling JIT fuser.") - disable_jit_fuser() - - # Initialize async checkpoint worker if enabled (idempotent if already initialized) - state.initialize_async_checkpoint_worker() - - setup_logging( - logging_level=cfg.logger.logging_level, - filter_warning=cfg.logger.filter_warnings, - modules_to_filter=cfg.logger.modules_to_filter, - set_level_for_all_loggers=cfg.logger.set_level_for_all_loggers, - ) - - # pg_collection is returned from initialize_megatron: - # - When use_decentralized_pg=True: uses HyperCommGrid to create local process groups - # - When use_decentralized_pg=False: uses mpu's global parallel state - pg_collection = initialize_megatron( - cfg=cfg, - get_embedding_ranks=get_embedding_ranks, - get_position_embedding_ranks=get_position_embedding_ranks, - restart_store=restart_store, - ) - - # Set CPU affinity for optimal host-device transfers when fine-grained activation offloading is enabled - if cfg.model.fine_grained_activation_offloading: - from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu - - set_ideal_affinity_for_current_gpu() - - timers = state.timers - - if cfg.logger.log_progress: - append_to_progress_log(cfg.checkpoint.save, "Starting job") - - if cfg.ft and cfg.ft.enable_ft_package: - 
fault_tolerance.setup(cfg, state) - fault_tolerance.maybe_setup_simulated_fault(cfg.ft) - - # Set pytorch JIT layer fusion options and warmup JIT functions. - set_jit_fusion_options(cfg.model, cfg.train.micro_batch_size) - - # Adjust the startup time so it reflects the largest value. - # This will be closer to what scheduler will see (outside of - # image ... launches. - start_time_tensor = torch.tensor([state.start_time], dtype=torch.double, device="cuda") - torch.distributed.all_reduce(start_time_tensor, op=torch.distributed.ReduceOp.MIN) - state.start_time = start_time_tensor.item() - - print_rank_0("time to initialize megatron (seconds): {:.3f}".format(time.time() - state.start_time)) - barrier_and_log("after megatron is initialized") - - # Create checkpoint manager for save/load operations. - checkpoint_manager = create_checkpoint_manager(cfg.checkpoint) - - # Tokenizer - timers("tokenizer-setup", log_level=0).start(barrier=True) - tokenizer = build_tokenizer(cfg.tokenizer) - # Handle model vocab_size configuration with proper validation - cfg.model.vocab_size, cfg.model.should_pad_vocab = _validate_and_set_vocab_size( - model_vocab_size=cfg.model.vocab_size, - tokenizer_vocab_size=tokenizer.vocab_size, - ) - - cfg.dataset.tokenizer = tokenizer - timers("tokenizer-setup").stop() - barrier_and_log("after tokenizer is built") - - # Initialize NVIDIA DLFw Inspect early (this must happen before TE modules are constructed) - initialize_tensor_inspect_pre_model_initialization(cfg.tensor_inspect) - - # Model, optimizer, and learning rate. 
- timers("model-and-optimizer-setup", log_level=0).start(barrier=True) - - # Register PEFT pre-wrap hook if PEFT is configured - if cfg.peft is not None: - peft_hook = _create_peft_pre_wrap_hook(cfg, state) - _register_pre_wrap_hook(cfg.model, peft_hook) - print_rank_0("Registered PEFT pre-wrap hook") - - if getattr(cfg.model, "restore_modelopt_state", False): - from megatron.bridge.training.post_training.checkpointing import load_modelopt_state - - def modelopt_pre_wrap_hook(model): - from megatron.bridge.training.post_training.checkpointing import has_modelopt_state - - # Check which checkpoint path has modelopt state - if cfg.checkpoint.pretrained_checkpoint and has_modelopt_state(cfg.checkpoint.pretrained_checkpoint): - checkpoint_path = cfg.checkpoint.pretrained_checkpoint - elif cfg.checkpoint.load and has_modelopt_state(cfg.checkpoint.load): - checkpoint_path = cfg.checkpoint.load - else: - raise RuntimeError( - f"No modelopt_state found in pretrained_checkpoint={cfg.checkpoint.pretrained_checkpoint} " - f"or load={cfg.checkpoint.load}" - ) - - load_modelopt_state(model, checkpoint_path) - return model - - _register_pre_wrap_hook(cfg.model, modelopt_pre_wrap_hook) - - model = _build_distributed_model(cfg, pg_collection) - - cfg.model.timers = timers - cfg.optimizer.timers = timers - optimizer, scheduler = setup_optimizer( - optimizer_config=cfg.optimizer, - scheduler_config=cfg.scheduler, - model=model, - use_gloo_process_groups=cfg.dist.use_gloo_process_groups, - # Only pass pg_collection when use_decentralized_pg is True. - # When False, mcore's optimizer will use parallel_state directly which supports Gloo. - pg_collection=pg_collection if cfg.dist.use_decentralized_pg else None, - optimizer_config_override_provider=cfg.optimizer_config_override_provider, - ) - timers("model-and-optimizer-setup").stop() - barrier_and_log("after model, optimizer, and learning rate scheduler are built") - - # Check if a local (non-persistent) checkpoint is available. 
Local - # checkpoints are independent of global ones — they don't write - # latest_train_state.pt to load_dir, so checkpoint_exists() won't - # find them. - _ckpt_ctx = getattr(checkpoint_manager, "checkpointing_context", {}) - has_local_checkpoint = ( - "local_checkpoint_manager" in _ckpt_ctx - and _ckpt_ctx["local_checkpoint_manager"].find_latest() != -1 - ) - - # For PEFT, the pretrained checkpoint is loaded in the pre-wrap hook - if cfg.peft is not None: - should_load_checkpoint = cfg.checkpoint.load is not None and checkpoint_exists(cfg.checkpoint.load) - if should_load_checkpoint: - # The finetune toggle is explicitly set to True in order to avoid loading optimizer and RNG states - # This is switched off here in order to load these states from the checkpoint - cfg.checkpoint.finetune = False - else: - should_load_checkpoint = ( - (cfg.checkpoint.load is not None and checkpoint_exists(cfg.checkpoint.load)) - or ( - cfg.checkpoint.pretrained_checkpoint is not None - and checkpoint_exists(cfg.checkpoint.pretrained_checkpoint) - ) - or has_local_checkpoint - ) - - if should_load_checkpoint: - timers("load-checkpoint", log_level=0).start(barrier=True) - checkpoint_manager.load(CheckpointLoadContext( - state=state, - model=model, - optimizer=optimizer, - opt_param_scheduler=scheduler, - skip_load_to_model_and_opt=cfg.dist.use_torch_fsdp2 or cfg.dist.use_megatron_fsdp, - )) - timers("load-checkpoint").stop(barrier=True) - timers.log(["load-checkpoint"]) - - # Finalize NVIDIA DLFw Inspect after model is built (attach loggers, module names, parallelism groups) - finalize_tensor_inspect_post_model_initialization( - cfg.tensor_inspect, - model, - state.tensorboard_logger, - state.wandb_logger, - comet_logger=state.comet_logger, - current_training_step=state.train_state.step, - ) - - _update_model_config_funcs( - model, - cfg.model.transformer if isinstance(cfg.model, (GPTModelConfig, MambaModelConfig)) else cfg.model, - cfg.ddp, - optimizer, - 
align_grad_reduce=cfg.dist.align_grad_reduce, - pg_collection=pg_collection, - ) - - # Fire on_data_init_start before any dataset files are opened. - # This is the correct place for JIT warmup with mock data and MLPerf - # init_stop/run_start logging. - if should_fire(callback_manager, "on_data_init_start"): - context = CallbackContext( - state=state, - model=model, - optimizer=optimizer, - scheduler=scheduler, - user_state=callback_manager.user_state, - ) - callback_manager.fire("on_data_init_start", context) - - # Data stuff. - timers("train/valid/test-data-iterators-setup", log_level=0).start(barrier=True) - if "tokenizer" in inspect.signature(train_valid_test_datasets_provider).parameters: - train_valid_test_datasets_provider = partial(train_valid_test_datasets_provider, tokenizer=tokenizer) - if "pg_collection" in inspect.signature(train_valid_test_datasets_provider).parameters: - train_valid_test_datasets_provider = partial(train_valid_test_datasets_provider, pg_collection=pg_collection) - - train_data_iterator, valid_data_iterator, test_data_iterator = setup_data_iterators( - cfg=cfg, - train_state=state.train_state, - model_length=len(model), - train_valid_test_datasets_provider=train_valid_test_datasets_provider, - dp_group=pg_collection.dp, - ) - timers("train/valid/test-data-iterators-setup").stop() - barrier_and_log("after dataloaders are built") - - # if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None: - # ft_integration.get_rank_monitor_client().init_workload_monitoring() - # ft_timeouts = ft_integration.get_rank_monitor_client().timeouts - # print_rank_0(f"Fault tolerance client initialized. Timeouts: {ft_timeouts}") - - # Print setup timing. 
- print_rank_0("done with setup ...") - timers.log(["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], barrier=True) - - return SetupOutput( - state, - model, - optimizer, - scheduler, - train_data_iterator, - valid_data_iterator, - test_data_iterator, - checkpoint_manager, - pg_collection, - ) - - -def _register_pre_wrap_hook(model_cfg: ModelConfig | ModelProviderMixin, hook): - """Register a pre-wrap hook on either ModelConfig or ModelProviderMixin.""" - if isinstance(model_cfg, ModelConfig): - model_cfg.pre_wrap_hooks.append(hook) - else: - model_cfg.register_pre_wrap_hook(hook) - - -def _build_distributed_model(cfg: ConfigContainer, pg_collection: ProcessGroupCollection) -> list[MegatronModule]: - """Build distributed model from either ModelConfig or ModelProviderMixin.""" - model_config = cfg.model - if isinstance(model_config, ModelConfig): - builder_cls = model_config.get_builder_cls() - builder = builder_cls(model_config) - return builder.build_distributed_models( - pg_collection=pg_collection, - ddp_config=cfg.ddp, - overlap_param_gather_with_optimizer_step=cfg.optimizer.overlap_param_gather_with_optimizer_step, - use_megatron_fsdp=cfg.dist.use_megatron_fsdp, - use_torch_fsdp2=cfg.dist.use_torch_fsdp2, - data_parallel_random_init=cfg.rng.data_parallel_random_init, - ) - else: - return model_config.provide_distributed_model( - ddp_config=cfg.ddp, - use_megatron_fsdp=cfg.dist.use_megatron_fsdp, - use_torch_fsdp2=cfg.dist.use_torch_fsdp2, - overlap_param_gather_with_optimizer_step=cfg.optimizer.overlap_param_gather_with_optimizer_step, - data_parallel_random_init=cfg.rng.data_parallel_random_init, - pg_collection=pg_collection, - ) - - -def _update_model_config_funcs( - model: MegatronModule, - model_config: TransformerConfig, - ddp_config: DistributedDataParallelConfig, - optimizer: Optional[MegatronOptimizer], - *, - align_grad_reduce: bool = True, - pg_collection: Optional[ProcessGroupCollection] = None, -) -> None: - """Update model 
config sync funcs based on initialized model.""" - if isinstance(model[0], (DistributedDataParallel, megatron_FSDP)) and ddp_config.overlap_grad_reduce: - assert model_config.no_sync_func is None, ( - "When overlap_grad_reduce is True, config.no_sync_func must be None; " - "a custom no_sync_func is not supported when overlapping grad-reduce" - ) - model_config.no_sync_func = [model_chunk.no_sync for model_chunk in model] - if len(model) == 1: - model_config.no_sync_func = model_config.no_sync_func[0] - if align_grad_reduce: - model_config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in model] - if len(model) == 1: - model_config.grad_sync_func = model_config.grad_sync_func[0] - if ddp_config.overlap_param_gather and ddp_config.align_param_gather: - model_config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model] - if len(model) == 1: - model_config.param_sync_func = model_config.param_sync_func[0] - if optimizer is not None: - model_config.finalize_model_grads_func = partial(finalize_model_grads, pg_collection=pg_collection) - model_config.grad_scale_func = optimizer.scale_loss - - -def _create_peft_pre_wrap_hook( - cfg: ConfigContainer, state: GlobalState -) -> Callable[[list[MegatronModule]], list[MegatronModule]]: - """Create a pre-wrap hook that handles PEFT logic. - - This hook is executed before the model is wrapped with DDP/FSDP and handles: - 1. Loading pretrained checkpoints for PEFT - 2. Applying PEFT transformation to the model - - Args: - cfg: Configuration container - state: Global state object containing timers and other state - - Returns: - A callable hook that can be registered with the model provider - """ - - def peft_pre_wrap_hook(model: list[MegatronModule]) -> list[MegatronModule]: - """Pre-wrap hook that handles PEFT transformation. 
- - Args: - model: List of base model modules before distributed wrapping - - Returns: - List of potentially PEFT-transformed model modules - """ - # Only apply PEFT logic if PEFT is configured - if cfg.peft is None: - return model - - print_rank_0("Applying PEFT pre-wrap hook...") - - # Load pretrained checkpoint if available - if cfg.checkpoint.pretrained_checkpoint is None or not checkpoint_exists(cfg.checkpoint.pretrained_checkpoint): - raise ValueError(f"Invalid pretrained checkpoint directory found: {cfg.checkpoint.pretrained_checkpoint}") - - # Explicitly set finetune to avoid loading optimizer and RNG states - cfg.checkpoint.finetune = True - state.timers("load-pretrained-checkpoint", log_level=0).start(barrier=True) - print_rank_0(f"Loading base model weights from: {cfg.checkpoint.pretrained_checkpoint}") - - # Directly call load_checkpoint_from path in order to avoid - # the load directory overriding the pretrained checkpoint path - # This is needed to initialize the base model weights first, and then conditionally load adapter states after - _load_checkpoint_from_path( - load_dir=cfg.checkpoint.pretrained_checkpoint, - state=state, - model=model, - optimizer=None, # Don't load optimizer - will be created after PEFT - opt_param_scheduler=None, # Don't load scheduler - will be created after PEFT - checkpointing_context={}, - skip_load_to_model_and_opt=False, - ignore_ckpt_step=True, # ckpt_step applies only to adapter checkpoints, not pretrained base model - ) - state.timers("load-pretrained-checkpoint").stop(barrier=True) - state.timers.log(["load-pretrained-checkpoint"]) - - # Apply PEFT transformation - transformed_model = _apply_peft_transformation(cfg.peft, model) - - return transformed_model - - return peft_pre_wrap_hook - - -def _apply_peft_transformation(peft, base_model: list[MegatronModule]) -> list[MegatronModule]: - """Apply PEFT transformation to the base model. 
- - Args: - peft: PEFT configuration/object - base_model: Base model before PEFT transformation - - Returns: - Model with PEFT transformation applied - """ - print_rank_0("Applying PEFT transformation...") - transformed_model = peft(base_model, training=True) - peft.set_params_to_save(transformed_model) - - # Log PEFT statistics - model_to_analyze = transformed_model[0] if isinstance(transformed_model, list) else transformed_model - total_params = 0 - trainable_params = 0 - for param in model_to_analyze.parameters(): - param_count = param.numel() - total_params += param_count - if param.requires_grad: - trainable_params += param_count - - print_rank_0("PEFT Statistics:") - print_rank_0(f" Total parameters: {total_params:,}") - print_rank_0(f" Trainable parameters: {trainable_params:,}") - print_rank_0(f" Trainable percentage: {100 * trainable_params / total_params:.2f}%") - - return transformed_model - - -def _validate_and_set_vocab_size(model_vocab_size: Optional[int], tokenizer_vocab_size: int) -> tuple[int, bool]: - """Validate and determine the correct vocab size for the model. - - Args: - model_vocab_size: Vocab size set in model config (can be None) - tokenizer_vocab_size: Unpadded tokenizer vocab size - - Returns: - tuple[int, bool]: The validated unpadded vocab size and padding flag - - vocab_size: The validated unpadded vocab size to use for the model - - should_pad_vocab: True if vocab should be padded, False otherwise - - Raises: - ValueError: If model vocab size is invalid - """ - if model_vocab_size is None: - # If model vocab size is not set, use the tokenizer's vocab size - # Enable padding since this came from tokenizer - return tokenizer_vocab_size, True - elif model_vocab_size < tokenizer_vocab_size: - # Vocab size smaller than tokenizer - raise ValueError( - f"Model vocab_size ({model_vocab_size}) cannot be smaller than tokenizer's vocab_size " - f"({tokenizer_vocab_size})." 
- ) - else: - # Model vocab size is explicitly set and is >= tokenizer vocab size - # Disable padding since this was explicitly set - if model_vocab_size > tokenizer_vocab_size: - logging.info( - f"Using preset vocab_size: {model_vocab_size} over the tokenizer vocab_size: {tokenizer_vocab_size}, dummy tokens:" - f" {model_vocab_size - tokenizer_vocab_size}." - ) - return model_vocab_size, False - - -def maybe_log_and_save_config(cfg: ConfigContainer) -> None: - """Save configuration to disk and log non-default values on rank 0. - - Instead of printing the full config YAML, this now logs only the values - that differ from Megatron Core defaults, making it easier to spot - unintended configuration deviations. - - The full config can still be saved to a file via logger.save_config_filepath. - """ - - if get_rank_safe() != 0: - return - - if cfg.logger.save_config_filepath is not None: - try: - cfg.to_yaml(cfg.logger.save_config_filepath) - except Exception as e: - print_rank_0(f"Error saving config to file {cfg.logger.save_config_filepath}: {e}") - - cfg.log_non_default_values() - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/training-loop-settings.md -```md -# Training Loop Configuration - -The {py:class}`bridge.training.config.TrainingConfig` contains settings related to the training loop bounds, exit conditions, validation, batch sizing, and memory management. - -## Key Parameters - -Configure these parameters to control core training behavior, resource utilization, and monitoring across distributed setups. - -### Batch Configuration -Define how data is batched and distributed across devices during training. 
- -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `micro_batch_size` | `Optional[int]` | `None` | Batch size per model instance (local batch size) | -| `global_batch_size` | `Optional[int]` | `None` | Training batch size across all devices | -| `rampup_batch_size` | `Optional[list[int]]` | `None` | Batch size ramp up: `[start_size, increment, ramp_samples]` | -| `decrease_batch_size_if_needed` | `bool` | `False` | Automatically decrease batch size if needed for fault tolerance | - -The relationship between batch sizes: -- **Global batch size** = `micro_batch_size` × `data_parallel_size` × `gradient_accumulation_steps` -- If `global_batch_size` is not set, it defaults to `micro_batch_size` × `data_parallel_size` - -### Training Duration - -Control when training stops using iteration counts, sample counts, or time-based limits. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `train_iters` | `Optional[int]` | `None` | Total number of iterations to train | -| `train_samples` | `Optional[int]` | `None` | Total number of samples to train | -| `exit_interval` | `Optional[int]` | `None` | Exit after iteration divisible by this value | -| `exit_duration_in_mins` | `Optional[int]` | `None` | Exit after this many minutes | - -**Training Mode Selection** - -Megatron-Bridge supports two modes for specifying training duration: - -1. **Iteration-based training**: Specify `train_iters` to control the total number of training iterations. -2. **Sample-based training**: Specify `train_samples` to control the total number of training samples. - -**Important constraints:** -- You must specify **exactly one** of `train_iters` or `train_samples` - not both. -- When using `train_samples`, training iterations are automatically calculated as `train_samples // global_batch_size`. -- Batch size rampup (`rampup_batch_size`) is not currently supported with sample-based training. 
-- Your scheduler configuration should match your training mode (see [Learning Rate Scheduling](optimizer-scheduler.md#learning-rate-scheduling)). - -### Validation -Configure validation frequency, duration, and evaluation-only modes. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `eval_iters` | `int` | `100` | Number of iterations for validation/test evaluation | -| `eval_interval` | `Optional[int]` | `1000` | Interval between validation runs | -| `skip_train` | `bool` | `False` | Skip training, only do evaluation and exit | - -**Note:** To control validation behavior: -- Set `eval_iters` to `0` to disable validation entirely (both during and after training). -- Set `eval_interval` to `None` to skip validation during training, but still run validation after training completes. - -### Memory Management -Control GPU memory cleanup and garbage collection to prevent memory issues during training. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `empty_unused_memory_level` | `Literal[0, 1, 2]` | `0` | Call `torch.cuda.empty_cache()` each iteration (0=off, 1=moderate, 2=aggressive) | -| `manual_gc` | `bool` | `False` | Synchronize Python garbage collection across ranks to avoid stragglers | -| `manual_gc_interval` | `int` | `0` | Training step interval for manual garbage collection (0=disabled) | -| `manual_gc_eval` | `bool` | `True` | Enable garbage collection during evaluation when using manual GC | - -### Signal Handling and Exit Conditions -Set up automatic checkpoint saving and clean exit procedures for signal-based interruptions. 
- -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `exit_signal_handler` | `bool` | `False` | Save checkpoint and shutdown gracefully on signal detection | -| `exit_signal` | `int` | `signal.SIGTERM` | Signal to handle for graceful shutdown | -| `exit_signal_handler_for_dataloader` | `bool` | `False` | Use signal handler for dataloader workers | - -### Performance Monitoring -Monitor training consistency and synchronization across distributed processes. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `check_weight_hash_across_dp_replicas_interval` | `Optional[int]` | `None` | Check weight hash consistency across data parallel replicas | -| `train_sync_interval` | `Optional[int]` | `None` | CPU-GPU synchronization interval to prevent CPU running ahead | - - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/distillation.md -```md -# Knowledge Distillation - -Megatron Bridge provides a streamlined setup for Knowledge Distillation (KD) training, making it easy to enable and integrate into your workflow. This section explains how to use this feature effectively. - -Knowledge Distillation is a technique where a pre-trained model (the "teacher") transfers its learned knowledge to a second model (the "student"), which is typically smaller and faster. This process helps the student model learn more efficiently by mimicking the behavior of the teacher. KD offers two key advantages over traditional training: faster convergence and higher final accuracy. - -In Megatron Bridge, KD is enabled by NVIDIA Model Optimizer (ModelOpt) — a library to optimize deep-learning models for inference on GPUs. - -## Knowledge Distillation Process - -The KD process involves these steps: - -1. **Loads Checkpoints**: Loads both the student and teacher model checkpoints. -2. 
**Replaces Loss Function**: Replaces the standard loss function with the KL-Divergence between the output logits (and potentially additional losses between pairs of intermediate model states). -3. **Trains Models**: Runs forward passes on both models, but executes the backward pass only on the student model. -4. **Saves Checkpoints**: Saves only the student model checkpoint, allowing it to be used later in the same manner as before. - -## Limitations - -* Only GPT-based checkpoints are currently supported. -* Student and teacher models must support the same parallelism strategy. -* If Pipeline Parallelism is enabled, intermediate-state based KD losses are only supported on the final pipeline stage. - -## Configuration - -### Knowledge Distillation Config - -You can configure the KD process via the `ModelOptDistillConfig` class or a YAML file. The configuration includes: - -* `logit_layers`: The layer names of student and teacher model logit layers. These names correspond to the PyTorch submodule attributes of the Megatron Core model. (For GPT-based models, this is `"output_layer"`). Default: `["output_layer", "output_layer"]` -* `intermediate_layer_pairs`: A list of pairs of intermediate layer names. These pairs will by default have a Cosine-Similarity loss between them, and if tensor-parallelism is enabled, these layers must have sequence parallel outputs (i.e. LayerNorms), as Cosine loss cannot have a split hidden dimension. Default: `[["decoder.final_layernorm", "decoder.final_layernorm"]]` -* `skip_lm_loss`: Whether to skip the default language modeling (LM) loss. If `false`, it will be added to the distillation loss. (Note it consumes more memory). Default: `true` -* `kd_loss_scale`: Relative scale factor for the distillation loss. The cumulative logits-and-intermediate loss gets scaled to `kd_loss_scale` times the magnitude of the LM loss. Not used if `skip_lm_loss` is `true`. 
Default: `1.0` -* `logit_kl_temperature`: Temperature variable for KL Divergence loss calculation. Default: `1.0` - -Example YAML configuration: - -```yaml -logit_layers: ["output_layer", "output_layer"] -intermediate_layer_pairs: - - ["decoder.final_layernorm", "decoder.final_layernorm"] -logit_kl_temperature: 2.0 -``` - -## Usage - -### Basic Usage with Default Configuration - -The simplest way to run knowledge distillation is to use or adapt one of the provided recipe scripts. Here's an example for distilling Llama3.2-3B into Llama3.2-1B: - -```bash -uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py -``` - -### Using a Custom YAML Config File - -You can provide a custom YAML configuration file to override default settings: - -```bash -uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \ - --config-file my_custom_config.yaml -``` - -### Using CLI Overrides - -Megatron Bridge supports Hydra-style CLI overrides for flexible configuration: - -```bash -uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \ - model.tensor_model_parallel_size=2 \ - model.teacher.tensor_model_parallel_size=2 -``` - -### Combining YAML and CLI Overrides - -CLI overrides take precedence over YAML configuration: - -```bash -uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \ - --config-file conf/my_config.yaml \ - train.global_batch_size=512 -``` - -## Model Support - -Currently, distillation is supported for GPT and Mamba-based models - -To enable distillation for a model: - -1. Set the `teacher` attribute to the teacher model configuration -2. Configure `kd_config` with desired distillation settings (else uses default) -3. 
Use `convert_to_distillation_provider()` to convert your existing model provider - -## Checkpointing - -During distillation training: - -* Only the **student model** checkpoints are saved -* Teacher model remains frozen and is not modified -* Checkpoints can be used for inference or further training like any standard checkpoint - -## Best Practices - -1. **Match Parallelism**: Ensure student and teacher use compatible parallelism configurations -2. **Monitor Loss**: Track both distillation loss and (if enabled) language modeling loss -3. **Batch Size**: Use larger batch sizes for better stability during distillation -4. **Learning Rate**: Start with a smaller LR than pretraining -5. **Data Quality**: Use high-quality, diverse training data for best distillation results - -## Troubleshooting - -### Out of Memory Errors - -* Reduce `train.micro_batch_size` -* Increase parallelism sizes -* Set `model.kd_config.skip_lm_loss = True` to save memory - -## References - -For more information on the underlying implementation, see: -* [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/README.md -```md -# Training and Customization - -This directory contains comprehensive documentation for training and customizing models with Megatron Bridge. Learn how to configure training, optimize performance, and customize training workflows. 
- -## Quick Navigation - -### I want to - -**🚀 Get started with training** -→ Start with [Configuration Container Overview](config-container-overview.md) to understand the training setup - -**⚙️ Configure training parameters** -→ See [Training Loop Settings](training-loop-settings.md) and [Optimizer & Scheduler](optimizer-scheduler.md) - -**📊 Monitor and profile training** -→ Check [Logging](logging.md) and [Profiling](profiling.md) guides - -**💾 Manage checkpoints** -→ Read [Checkpointing](checkpointing.md) for saving and resuming training - -**⚡ Optimize performance** -→ Explore [Performance Guide](../performance-guide.md) and [Performance Summary](../performance-summary.md) - -**🔧 Customize training** -→ See [PEFT](peft.md), [Distillation](distillation.md), [Entry Points](entry-points.md), and [Callbacks](callbacks.md) - -## Core Training Documentation - -### Configuration and Setup - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Configuration Container Overview](config-container-overview.md)** | Central configuration object for all training settings | First time setting up training | -| **[Entry Points](entry-points.md)** | Training entry points and execution flow | Understanding how training starts | -| **[Training Loop Settings](training-loop-settings.md)** | Training loop parameters and configuration | Configuring batch sizes, iterations, validation | - -### Optimization and Performance - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Optimizer & Scheduler](optimizer-scheduler.md)** | Optimizer and learning rate scheduler configuration | Setting up optimization | -| **[Mixed Precision](mixed-precision.md)** | Mixed precision training for memory efficiency | Reducing memory usage | -| **[Communication Overlap](communication-overlap.md)** | Overlapping communication with computation | Optimizing distributed training | -| **[Hybrid Context Parallel](hybrid-context-parallel.md)** | 
Hierarchical `a2a+p2p` context parallel guidance | Advanced long-sequence scaling | -| **[Attention Optimizations](attention-optimizations.md)** | Optimizing attention mechanisms | Improving training speed | -| **[Activation Recomputation](activation-recomputation.md)** | Gradient checkpointing strategies | Reducing memory footprint | -| **[CPU Offloading](cpu-offloading.md)** | Offloading to CPU for memory management | Working with limited GPU memory | - -### Monitoring and Debugging - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Logging](logging.md)** | Logging configuration and TensorBoard/WandB integration | Monitoring training progress | -| **[Profiling](profiling.md)** | Performance profiling and analysis | Identifying bottlenecks | -| **[Resiliency](resiliency.md)** | Handling failures and recovery | Building robust training pipelines | - -### Advanced Features - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[PEFT](peft.md)** | Parameter-Efficient Fine-Tuning (LoRA, etc.) | Fine-tuning with limited resources | -| **[Packed Sequences](packed-sequences.md)** | Sequence packing for efficiency | Optimizing data loading | -| **[Megatron FSDP](megatron-fsdp.md)** | Stable overview of Megatron FSDP | Choosing an FSDP path | -| **[Distillation](distillation.md)** | Knowledge distillation techniques | Transferring knowledge between models | -| **[Checkpointing](checkpointing.md)** | Checkpoint saving, loading, and resuming | Managing training state | -| **[Callbacks](callbacks.md)** | Inject custom logic into training loop | Custom logging, metrics, third-party integrations | - -## Training Workflow - -A typical training workflow involves: - -1. **Configure Training** - Set up `ConfigContainer` with model, data, and training parameters -2. **Prepare Data** - Configure dataset loading and preprocessing -3. **Set Optimization** - Configure optimizer, scheduler, and mixed precision -4. 
**Enable Monitoring** - Set up logging and profiling -5. **Configure Checkpointing** - Set up checkpoint saving and resuming -6. **Launch Training** - Start training with configured entry points -7. **Monitor Progress** - Track metrics via logging and profiling -8. **Resume if Needed** - Use checkpointing to resume from saved state - -## Related Documentation - -- **[Main Documentation Index](../index.md)** - Return to main documentation -- **[Performance Guide](../performance-guide.md)** - Comprehensive performance optimization guide -- **[Performance Summary](../performance-summary.md)** - Quick performance reference -- **[Recipe Usage](../recipe-usage.md)** - Using training recipes -- **[Parallelisms](../parallelisms.md)** - Understanding distributed training strategies -- **[Bridge Guide](../bridge-guide.md)** - Working with Hugging Face models - -## Common Training Scenarios - -### 🆕 First-Time Training Setup - -1. [Configuration Container Overview](config-container-overview.md) - Understand the configuration system -2. [Entry Points](entry-points.md) - Learn how to start training -3. [Training Loop Settings](training-loop-settings.md) - Configure basic training parameters -4. [Logging](logging.md) - Set up monitoring - -### ⚡ Performance Optimization - -1. [Performance Guide](../performance-guide.md) - Comprehensive optimization strategies -2. [Mixed Precision](mixed-precision.md) - Enable mixed precision training -3. [Communication Overlap](communication-overlap.md) - Optimize distributed training -4. [Activation Recomputation](activation-recomputation.md) - Reduce memory usage -5. [Profiling](profiling.md) - Identify bottlenecks - -### 💾 Production Training - -1. [Checkpointing](checkpointing.md) - Reliable checkpoint management -2. [Resiliency](resiliency.md) - Handle failures gracefully -3. [Logging](logging.md) - Comprehensive monitoring -4. [Profiling](profiling.md) - Performance analysis - -### 🔧 Customization - -1. 
[PEFT](peft.md) - Parameter-efficient fine-tuning -2. [Distillation](distillation.md) - Knowledge distillation -3. [Entry Points](entry-points.md) - Custom training workflows -4. [Callbacks](callbacks.md) - Inject custom logic (third-party integrations) - ---- - -**Ready to start training?** Begin with [Configuration Container Overview](config-container-overview.md) or return to the [main documentation](../README.md). - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/communication-overlap.md -```md -# Communication Overlap - -Communication overlap reduces exposed communication cost in distributed training -by hiding collectives or point-to-point transfers under useful compute. - -This page is the stable guide for what communication overlap is, when it tends -to help, and which boundaries are durable across Megatron Bridge. For exact -knobs, code anchors, and verification commands, see: - -- `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md` -- `skills/perf-techniques/expert-parallel-overlap/SKILL.md` - -## What It Is - -In Bridge, communication overlap is a family of related techniques rather than a -single switch: - -| Mode | What gets hidden | Main gate | -|---|---|---| -| DP | gradient reduce-scatter and parameter all-gather | distributed-optimizer overlap path | -| TP | tensor-parallel collectives under layer compute | `CommOverlapConfig.tp_comm_overlap` plus sequence parallelism | -| PP | pipeline send/recv work under schedule execution | pipeline schedule and virtual pipeline layout | -| CP | context-parallel communication inside CP execution paths | CP implementation choice | -| EP | MoE token dispatch/combine communication under expert compute | `overlap_moe_expert_parallel_comm` | - -These paths share the same goal, but they do not share the same enablement -rules, evidence level, or failure modes. - -## What Problem It Solves - -Distributed training often becomes communication-bound before it becomes -compute-bound. 
Once TP, DP, PP, CP, or EP traffic is visible on the critical -path, adding more GPUs may raise communication time faster than it raises useful -compute. - -Communication overlap addresses that by moving communication earlier or later in -the step so the same transfer can happen while some other part of the model is -already doing useful work. It does not change the training objective. It tries -to reduce idle time. - -## Impacted Training Dimensions - -| Dimension | Effect | Confidence | Why | -|---|---|---|---| -| `speed` | ~0-15% faster step time, mode-dependent | medium | The whole point is to hide communication time, but gain depends strongly on which overlap mode is active and whether communication is actually exposed. EP overlap measured flat to ~13% slower on small-EP Qwen3-30B-A3B, so gains are not guaranteed. | -| `memory` | neutral (some modes add ~1-2 GB for buffers) | low | Overlap itself is usually not a primary memory technique, although some implementations (e.g., TP userbuffers) add buffer or scheduling constraints. | -| `scale` | positive at higher parallelism degrees | medium | Overlap becomes more valuable as communication dominates larger distributed runs. | -| `convergence` | no change expected | medium | The intent is to preserve the same training math, though schedule changes can alter floating-point accumulation order. | -| `stability` | adds operational constraints | medium | More overlap usually means tighter requirements around schedule shape, precision, runtime versions, and feature combinations. 
| - -## When to Use It - -Enable communication overlap when all of the following are mostly true: - -- the distributed configuration already works correctly without overlap -- communication is a meaningful part of step time -- you are tuning throughput or utilization, not doing first bring-up -- you can benchmark the specific overlap mode you plan to use - -As a rule of thumb: - -| Mode | Good first use case | Recommendation | -|---|---|---| -| DP | distributed optimizer on multi-GPU or multi-node training | Usually worth considering early once optimizer sharding is already chosen. | -| TP | `TP >= 2` with sequence parallelism and TE-enabled path | Benchmark when TP collectives are visible in the profile. | -| PP | interleaved pipeline schedules where p2p overhead is visible | Treat as schedule tuning, not a blanket PP default. | -| CP | large-context runs already using CP | Follow the CP-specific guidance rather than treating it as a separate generic knob. | -| EP | large-scale MoE with many micro-batches and inter-node A2A cost | Most promising at larger EP and with higher-latency dispatcher backends. | - -Measured repo evidence today is strongest for MoE EP overlap. On -Qwen3-30B-A3B with EP=4 and `alltoall` on 2 H100 nodes, EP overlap is -numerically safe at GBS=8 but provides no speedup, and it is about 13% slower -at GBS=64. On Qwen3-Next-80B-A3B with EP=8 and `alltoall` on 8 nodes, the -overlap variants are stable while the non-overlap baseline NaNs, but -`delay_wgrad_compute` is still about 4.8% slower than overlap-only. That makes -EP overlap correctness-backed in this repo, but not yet broadly speedup-backed. 
- -## When Not to Use It - -Avoid communication overlap when any of these are true: - -- you are still debugging a new distributed setup -- the profile is compute-bound rather than communication-bound -- the required companion feature is missing, such as sequence parallelism for TP -- another feature already imposes conflicting runtime constraints -- you have not benchmarked the exact model and parallelism shape - -For MoE EP overlap specifically, avoid treating it as a default when: - -- `EP <= 4` with `alltoall` on `<= 2` nodes -- the run has very few pipeline micro-batches -- `moe_shared_expert_overlap` must stay enabled -- full recompute or recompute scheduling incompatible with EP overlap is required - -## Feature Interactions - -The most important interactions are: - -- DP overlap is tied to distributed-optimizer behavior rather than a fully independent tuning path. -- TP overlap depends on sequence parallelism and the supported TE overlap path. -- PP and EP overlap interact with virtual pipeline layout when `PP > 1`. -- CP overlap should be reasoned about together with the chosen CP communication type. -- EP overlap with DeepEP or HybridEP requires explicitly switching the dispatcher to `flex`. -- EP overlap and `moe_shared_expert_overlap` are mutually exclusive. -- CUDA graphs plus `delay_wgrad_compute` adds extra TE-version and graph-scope restrictions. -- Launch-time environment tuning can conflict across overlap paths, especially TP or CP overlap versus DeepEP or HybridEP tuning. - -## Bridge Configuration - -Communication overlap is configured through `CommOverlapConfig` plus -mode-specific model settings. There is no single universal toggle — DP, TP, -PP, CP, and EP each have different prerequisites and should be enabled based -on the actual bottleneck. 
- -For config examples and minimal runnable commands, see: - -- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md) -- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md) - -## Expected Metric Changes - -| Metric | Expected Change | Conditions | Evidence | -|---|---|---|---| -| `step_time` | down | DP overlap with distributed optimizer on communication-heavy runs | expected | -| `step_time` | down | TP overlap with `TP >= 2`, sequence parallelism, and supported TE path | expected | -| `pipeline_idle_time` | down | interleaved PP where p2p cost is visible | expected | -| `step_time` | flat | Qwen3-30B-A3B, EP=4, `alltoall`, 2 nodes, GBS=8 | measured: 822ms baseline vs 827ms overlap | -| `step_time` | up | same model/config, GBS=64 | measured: 4889ms baseline vs 5538ms overlap | -| `step_time` | up | Qwen3-Next-80B-A3B, EP=8, `alltoall`, 8 nodes, `delay_wgrad_compute=True` vs overlap-only | measured: 4912ms vs 4686ms | - -Do not assume one overlap win transfers automatically to another mode. The -correct question is always "which communication path is exposed in this run?" - -## Common Failure Modes - -- TP overlap silently disables itself when sequence parallelism is off or `TP < 2`. -- PP overlap expectations are wrong when the schedule is non-interleaved or VPP is missing. -- EP overlap asserts when `PP > 1` but `virtual_pipeline_model_parallel_size` is unset. -- EP overlap asserts when full recompute, recompute method, or shared-expert overlap stays enabled. -- Setting `moe_flex_dispatcher_backend` alone does not activate DeepEP or HybridEP; the dispatcher must actually switch to `flex`. -- Small-EP `alltoall` MoE runs can get slower because scheduling overhead is larger than the communication being hidden. 
- -## Related Docs - -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/cuda-graphs.md](cuda-graphs.md) -- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md) -- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md) -- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md) -- [skills/perf-techniques/moe-comm-overlap/SKILL.md](../skills/perf-techniques/moe-comm-overlap/SKILL.md) -- [skills/perf-techniques/moe-comm-overlap/card.yaml](../skills/perf-techniques/moe-comm-overlap/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/common.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -from megatron.core.distributed import DistributedDataParallelConfig - -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.peft.lora import LoRA -from megatron.bridge.recipes.utils.finetune_utils import default_squad_config -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedInitConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) - - -def _pretrain_common() -> ConfigContainer: - """Create a base pre-training ConfigContainer with common defaults for any language model. - - This function returns a ConfigContainer template with sensible defaults. - The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` before use. - - Returns: - ConfigContainer: Base configuration template for pre-training. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default optimizer and scheduler - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=3e-4, - min_lr=3e-5, - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - train=TrainingConfig( - train_iters=300000, - global_batch_size=32, - micro_batch_size=2, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=500, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - these are the commonly overridden settings - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ), - # Dataset config - uses mock data by default - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=4096, - num_dataset_builder_threads=1, - blend=None, # Mock data mode - blend_per_split=None, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - # Logger config - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - 
tokenizer_model=None, # Must be set by each recipe - ), - # Checkpoint config - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - rng=RNGConfig(seed=1234), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - ) - - return cfg - - -def _sft_common() -> ConfigContainer: - """Create a base SFT (Supervised Fine-Tuning) ConfigContainer with common defaults. - - This function returns a ConfigContainer template with sensible defaults for full SFT - (not LoRA/DoRA). The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` - before use. - - Key differences from pre-training: - - Uses HFDatasetConfig with SQuAD as default dataset - - Lower learning rate (5e-6) suitable for full fine-tuning - - Fewer training iterations (1000) - - Smaller batch sizes - - Supports pretrained_checkpoint loading - - No PEFT (full parameter training) - - Returns: - ConfigContainer: Base configuration template for full SFT. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for SFT - seq_length = 2048 - - # Packed sequence is enabled by default for training efficiency - # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1 - packed_sequence = True - pad_seq_to_mult = 1 # Override in model config if context_parallel_size > 1 - - # Optimizer and scheduler with lower LR for full SFT - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=50, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=5e-6, # Lower LR for full fine-tuning - min_lr=0.0, - adam_beta2=0.98, # Common for fine-tuning - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - shorter training for SFT - train=TrainingConfig( - train_iters=1000, - global_batch_size=128, - micro_batch_size=1, - ), - validation=ValidationConfig( - eval_interval=100, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - minimal settings, model-specific configs can override - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), - # Dataset config - uses SQuAD with packed sequences by default - dataset=default_squad_config( - seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult - ), - # Logger config - logger=LoggerConfig( - log_interval=1, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=None, # Must be set by 
each recipe - ), - # Checkpoint config with pretrained_checkpoint support - checkpoint=CheckpointConfig( - save_interval=100, - save=checkpoint_dir, - load=checkpoint_dir, - pretrained_checkpoint=None, # Set to load from pretrained weights - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - different seed from pretrain - rng=RNGConfig(seed=5678), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - # No PEFT for full SFT - peft=None, - ) - - return cfg - - -def _peft_common() -> ConfigContainer: - """Create a base PEFT (Parameter-Efficient Fine-Tuning) ConfigContainer with LoRA defaults. - - This function returns a ConfigContainer template with sensible defaults for PEFT - using LoRA. The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` - before use. - - Key differences from full SFT: - - Higher learning rate (1e-4) suitable for adapter training - - LoRA enabled by default with standard settings (dim=32, alpha=32) - - Targets all linear layers: linear_qkv, linear_proj, linear_fc1, linear_fc2 - - Returns: - ConfigContainer: Base configuration template for PEFT with LoRA. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for PEFT - seq_length = 2048 - - # Packed sequence is enabled by default for training efficiency - # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1 - packed_sequence = True - pad_seq_to_mult = 1 # Override in model config if context_parallel_size > 1 - - # Optimizer and scheduler with higher LR for PEFT (only training adapters) - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=50, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=1e-4, # Higher LR for adapter training - min_lr=0.0, - adam_beta2=0.98, # Common for fine-tuning - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - shorter training for PEFT - train=TrainingConfig( - train_iters=1000, - global_batch_size=128, - micro_batch_size=1, - ), - validation=ValidationConfig( - eval_interval=100, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - minimal settings for PEFT - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), - # Dataset config - uses SQuAD with packed sequences by default - dataset=default_squad_config( - seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult - ), - # Logger config - logger=LoggerConfig( - log_interval=1, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=None, # Must be set by 
each recipe - ), - # Checkpoint config with pretrained_checkpoint support - checkpoint=CheckpointConfig( - save_interval=100, - save=checkpoint_dir, - load=checkpoint_dir, - pretrained_checkpoint=None, # Set to load from pretrained weights - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - different seed from pretrain - rng=RNGConfig(seed=5678), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - # LoRA config with standard defaults - peft=LoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=32, - alpha=32, - dropout=0.0, - dropout_position="pre", - lora_A_init_method="xavier", - lora_B_init_method="zero", - a2a_experimental=False, - lora_dtype=None, # Uses model's dtype - ), - ) - - return cfg - - -def _sft_common_vlm() -> ConfigContainer: - """Create a base SFT ConfigContainer with common defaults for Vision-Language Models. - - This function inherits from `_sft_common()` and overrides VLM-specific settings. - The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. - - Key differences from LLM SFT (`_sft_common`): - - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) - - Uses NullTokenizer (VLMs use processor instead of tokenizer) - - DDP config optimized for VLM training (no grad/param overlap) - - Supports freeze options for language_model, vision_model, vision_projection - - Different training defaults (train_iters=300000, GBS=32, MBS=2) - - Different RNG seed (1234) - - Returns: - ConfigContainer: Base configuration template for VLM full SFT. 
- """ - # Start from the LLM SFT common config - cfg = _sft_common() - - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for VLM - seq_length = 4096 - - # VLM-specific training config - longer training with different batch sizes - cfg.train.train_iters = 300000 - cfg.train.global_batch_size = 32 - cfg.train.micro_batch_size = 2 - cfg.train.manual_gc = True - cfg.train.manual_gc_interval = 100 - cfg.train.manual_gc_eval = 100 - - # VLM-specific validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - - # VLM-specific optimizer settings - higher LR for VLM training - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=3e-4, - min_lr=3e-5, - ) - cfg.optimizer = opt_cfg - cfg.scheduler = scheduler_cfg - - # VLM-specific DDP config - no overlap for VLMs - cfg.ddp = DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ) - - # VLM-specific dataset - uses HuggingFace dataset provider - # hf_processor_path must be set by model-specific config - cfg.dataset = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=None, # Must be set by model-specific config - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=True, - ) - - # VLM uses NullTokenizer - actual tokenization is handled by the processor - cfg.tokenizer = 
TokenizerConfig( - tokenizer_type="NullTokenizer", - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # VLM-specific logger config - cfg.logger = LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ) - - # VLM-specific checkpoint config - cfg.checkpoint.save_interval = 500 - cfg.checkpoint.save = checkpoint_dir - cfg.checkpoint.load = checkpoint_dir - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.fully_parallel_save = True - - # VLM uses different RNG seed - cfg.rng = RNGConfig(seed=1234) - - return cfg - - -def _peft_common_vlm() -> ConfigContainer: - """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models. - - This function inherits from `_peft_common()` and overrides VLM-specific settings. - The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. - - Key differences from LLM PEFT (`_peft_common`): - - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) - - Uses NullTokenizer (VLMs use processor instead of tokenizer) - - DDP config optimized for VLM training (no grad/param overlap) - - Supports freeze options for language_model, vision_model, vision_projection - - Different training defaults (train_iters=300000, GBS=32, MBS=2) - - Different RNG seed (1234) - - Higher LR (1e-4) for adapter training - - Returns: - ConfigContainer: Base configuration template for VLM PEFT with LoRA. 
- """ - # Start from the LLM PEFT common config - cfg = _peft_common() - - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for VLM - seq_length = 4096 - - # VLM-specific training config - longer training with different batch sizes - cfg.train.train_iters = 300000 - cfg.train.global_batch_size = 32 - cfg.train.micro_batch_size = 2 - cfg.train.manual_gc = True - cfg.train.manual_gc_interval = 100 - cfg.train.manual_gc_eval = 100 - - # VLM-specific validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - - # VLM-specific optimizer settings - higher LR for PEFT - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=1e-4, # Higher LR for adapter training - min_lr=1e-5, - ) - cfg.optimizer = opt_cfg - cfg.scheduler = scheduler_cfg - - # VLM-specific DDP config - no overlap for VLMs - cfg.ddp = DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ) - - # VLM-specific dataset - uses HuggingFace dataset provider - # hf_processor_path must be set by model-specific config - cfg.dataset = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=None, # Must be set by model-specific config - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=True, - ) - - # VLM uses NullTokenizer - actual tokenization is handled by the 
processor - cfg.tokenizer = TokenizerConfig( - tokenizer_type="NullTokenizer", - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # VLM-specific logger config - cfg.logger = LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ) - - # VLM-specific checkpoint config - cfg.checkpoint.save_interval = 500 - cfg.checkpoint.save = checkpoint_dir - cfg.checkpoint.load = checkpoint_dir - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.fully_parallel_save = True - - # VLM uses different RNG seed - cfg.rng = RNGConfig(seed=1234) - - # Keep LoRA config from _peft_common() - it's already set with standard defaults - - return cfg - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/hybrid-context-parallel.md -```md -# Hybrid / Hierarchical Context Parallel - -This page covers the stable Bridge-facing meaning of hierarchical context -parallelism, especially the `a2a+p2p` transport path and -`hierarchical_context_parallel_sizes`. - -For operational setup, code anchors, and verification commands, see -[skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md). - -## What It Is - -Context parallelism (CP) splits the input sequence across GPUs so each rank -processes a chunk. The GPUs must communicate KV data during attention. 
There are -several CP communication backends: - -| `cp_comm_type` | Mechanism | Async / Overlap | Constraint | -|---|---|---|---| -| `"p2p"` | Ring-exchange of KV chunks | Yes | None | -| `"all_gather"` | All-gather full KV before attention | No | None | -| `"a2a"` | All-to-all: scatter heads, gather full sequence (Ulysses-style) | N/A | **CP <= num_kv_heads** | -| `"a2a+p2p"` | Hierarchical: a2a within inner group, p2p across outer group | Partial (p2p part) | Requires `hierarchical_context_parallel_sizes` | - -**HCP (`a2a+p2p`)** exists to scale CP beyond the KV head count by combining -a2a (fast, head-parallel) on intra-node links with p2p (async, -sequence-parallel) on inter-node links. - -It is important to separate this from the upstream boolean -`hybrid_context_parallel`, which is a different feature for balancing packed or -variable-length workloads. The two concepts should not be treated as -interchangeable. - -### Why a2a is limited by KV heads - -a2a transposes the parallelism dimension: each rank trades its sequence chunk -for a subset of attention heads. After the all-to-all, every rank has the -**full sequence** but only `heads / CP` heads. This means: - -- `heads / CP` must be a positive integer. -- The bottleneck is KV heads (not Q heads), because in GQA the KV heads are the - indivisible unit. -- If the model has 8 KV heads, pure a2a supports at most CP=8. - -HCP breaks this limit by applying a2a only within a sub-group small enough to -fit within the KV head count. - -## When to Use It - -**Use HCP when ALL of these are true:** - -1. You need CP larger than `num_kv_heads / TP` (pure a2a won't fit). -2. You cannot (or don't want to) increase TP to shrink CP. -3. Your cluster has a clear bandwidth hierarchy (e.g., NVLink intra-node >> IB - inter-node). - -**Prefer pure `a2a` when:** - -- You can adjust TP so that `CP <= num_kv_heads / TP`. This is simpler, avoids - the p2p overhead, and often yields the same throughput with better memory - headroom. 
- -**Prefer pure `p2p` when:** - -- You have very few KV heads or want maximum CP flexibility. -- Your workload can hide the p2p latency behind compute (long sequences help). - -### Decision example - -Model: 8 KV heads. Cluster: 4 nodes x 8 GPUs. Goal: train 128K sequences. - -| Option | TP | CP | `cp_comm_type` | Notes | -|---|---|---|---|---| -| A | 1 | 16 | `a2a+p2p` with `[8,2]` | a2a intra-node (8 GPUs), p2p across 2 node-groups | -| B | 2 | 4 | `a2a` | CP=4 <= 8 KV heads. Simpler. Often same throughput. | -| C | 1 | 16 | `p2p` | Works but no a2a bandwidth benefit intra-node | - -In practice, **option B is usually preferred** -- benchmarks showed identical -throughput to option A with more memory headroom. - -HCP (`a2a+p2p`) should be treated as an advanced feature rather than a default recommendation. - -## Stable Bridge Limitation - -The most important Bridge-specific limitation is that hierarchical context -parallelism is currently supported only on the MPU initialization path. - -In practice, that means: - -- `dist.use_decentralized_pg=False` is the supported Bridge path -- the decentralized process-group path should not be assumed to materialize HCP - groups - -## Stable Constraints - -The durable constraints are: - -- `hierarchical_context_parallel_sizes` must match - `context_parallel_size` multiplicatively -- the usual CP sequence-length divisibility rules still apply -- Transformer Engine version support matters for `a2a+p2p` - -## Recommendation Level - -Use hierarchical context parallelism in Bridge only when you intentionally want -that transport path and are prepared to validate execution-path details. It is -not yet the kind of feature that should be presented as universally safe across -all Bridge initialization modes. 
- -## Related Docs - -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/communication-overlap.md](communication-overlap.md) -- [skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md) -- [skills/perf-techniques/hybrid-context-parallel/card.yaml](../skills/perf-techniques/hybrid-context-parallel/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/profiling.md -```md -# Profiling - -Megatron Bridge provides built-in support for profiling training jobs using a range of performance analysis tools. These include NVIDIA Nsight Systems (Nsys) for workflow optimization, as well as PyTorch-based profilers and memory trackers to monitor performance and memory usage patterns during training. - -## ProfilingConfig Overview - -{py:class}`bridge.training.config.ProfilingConfig` is a dataclass that encapsulates profiling-related settings for training. It resides inside the overall {py:class}`bridge.training.config.ConfigContainer`, which represents the complete configuration for a training run. - - -### Profiling Options - -The configuration supports two mutually exclusive profiling options: - -- **NSys profiling** (`use_nsys_profiler`) -- **PyTorch profiling** (`use_pytorch_profiler`) - -You can enable one or the other, but not both at the same time. - - -### Step Range and Target Ranks - -All profiling modes allow you to configure: - -- **Step range**: `profile_step_start` and `profile_step_end` -- **Target ranks**: `profile_ranks` - -By default, profiling targets rank 0. You can specify multiple ranks to analyze different parts of your distributed training setup. - - -### Advanced Profiling Features - -The configuration includes options for recording tensor shapes (`record_shapes`) and enabling memory profiling (`record_memory_history`) with a customizable output path (`memory_snapshot_path`). 
These features offer deeper visibility into your model’s memory consumption and tensor-level operations during training. - - -## NSys Profiling - -NVIDIA Nsys is a system-wide performance analysis tool designed to help you tune and optimize CUDA applications. Megatron Bridge integrates with Nsys to enable profiling specific steps of your training job, making it easy to collect detailed performance data without manual instrumentation. - -```{note} -NSys profiling cannot be used with the `FaultTolerancePlugin` due to implementation conflicts. If both are enabled, the framework will automatically disable NSys profiling and emit a warning. -``` - -### Configure NSys Profiling - -Enable NSys profiling by setting `use_nsys_profiler=True` in your `ProfilingConfig`. The key configuration options include: - -```python -from megatron.bridge.training.config import ProfilingConfig - -# In your ConfigContainer setup, cfg is a ConfigContainer instance -cfg.profiling = ProfilingConfig( - use_nsys_profiler=True, - profile_step_start=10, - profile_step_end=15, - profile_ranks=[0, 1], # Profile first two ranks - record_shapes=False, # Optional: record tensor shapes -) -``` - -### Launch with NSys - -When using NSys profiling, launch your training script with the NSys command wrapper: - -```bash -nsys profile -s none -o PROFILE_OUTPUT_PATH -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python TRAINING_SCRIPT -``` - -Replace `PROFILE_OUTPUT_PATH` with your desired output path and `TRAINING_SCRIPT` with your training script. The `--capture-range=cudaProfilerApi` option ensures profiling is controlled by the framework's step range configuration. - -### Configure Profiling with the NeMo Run NSys Plugin - -Recipe users can leverage the {py:class}`bridge.recipes.run_plugins.NsysPlugin` to configure NSys profiling through NeMo Run executors. The plugin provides a convenient interface for setting up profiling without manually configuring the underlying NSys command. 
- -```python -import nemo_run as run -from megatron.bridge.recipes.run_plugins import NsysPlugin - -# Create your recipe and executor -recipe = your_recipe_function() -executor = run.SlurmExecutor(...) - -# Configure NSys profiling via plugin -plugins = [ - NsysPlugin( - profile_step_start=10, - profile_step_end=15, - profile_ranks=[0, 1], - nsys_trace=["nvtx", "cuda"], # Optional: specify trace events - record_shapes=False, - nsys_gpu_metrics=False, - ) -] - -# Run with profiling enabled -with run.Experiment("nsys_profiling_experiment") as exp: - exp.add(recipe, executor=executor, plugins=plugins) - exp.run() -``` - -The plugin automatically configures the NSys command line options and sets up the profiling configuration in your training job. - -### Analyze Results - -After your profiling run completes, the NSys profile files (`.nsys-rep`) will be generated. To analyze them, install [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems) from the NVIDIA Developer website, open the files in the NSys GUI, and use the timeline view to explore the performance characteristics of your training job. - -## PyTorch Profiler - -Megatron Bridge supports the built-in PyTorch profiler, which is useful for viewing profiles in TensorBoard and understanding PyTorch-level performance characteristics. 
- -### Configure PyTorch Profiler - -Enable PyTorch profiling by setting `use_pytorch_profiler=True` in your `ProfilingConfig`: - -```python -from megatron.bridge.training.config import ProfilingConfig - -cfg.profiling = ProfilingConfig( - use_pytorch_profiler=True, - profile_step_start=10, - profile_step_end=15, - profile_ranks=[0], - record_shapes=True, # Record tensor shapes for detailed analysis -) -``` - -### Configure Profiling with the PyTorch Profiler Plugin - -Similar to NSys, recipe users can use the {py:class}`bridge.recipes.run_plugins.PyTorchProfilerPlugin` for convenient configuration: - -```python -from megatron.bridge.recipes.run_plugins import PyTorchProfilerPlugin - -plugins = [ - PyTorchProfilerPlugin( - profile_step_start=10, - profile_step_end=15, - profile_ranks=[0], - record_memory_history=True, - memory_snapshot_path="memory_snapshot.pickle", - record_shapes=True, - ) -] -``` - -## Memory Profiling - -Megatron Bridge provides built-in support for CUDA memory profiling to track and analyze memory usage patterns during training, including GPU memory allocation and consumption tracking. - -More information about the generated memory profiles can be found [here](https://pytorch.org/blog/understanding-gpu-memory-1/). - -### Configure Memory Profiling - -Enable memory profiling by setting `record_memory_history=True` in your `ProfilingConfig`. This can be used with either profiling mode: - -```python -from megatron.bridge.training.config import ProfilingConfig - -cfg.profiling = ProfilingConfig( - use_pytorch_profiler=True, # or use_nsys_profiler=True - profile_step_start=10, - profile_step_end=15, - profile_ranks=[0], - record_memory_history=True, - memory_snapshot_path="memory_trace.pickle", # Customize output path -) -``` - -### Analyze Memory Usage - -After the run completes, memory snapshots for each specified rank are saved to the designated path. 
Load these traces using the PyTorch Memory Viz tool to plot memory usage over time and detect bottlenecks or leaks in your training pipeline. - -## Optimize Profiling Accuracy - -Profiling adds overhead to your training job, so measured timings may be slightly higher than normal operation. For accurate profiling results, disable other intensive operations like frequent checkpointing during the profiled step range. Choose your profiling step range carefully to capture representative training behavior while minimizing the performance impact on the overall job. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/README.md -```md -# Training Scripts - -Generic launcher and training scripts that work with any GPT-based model family (e.g. Deepseek, Llama, Gemma, Qwen, GPT, etc.). - -## Overview - -These scripts provide a generic interface for training GPT-based models in Megatron Bridge: - -- `run_recipe.py` - Generic pretraining/finetuning for GPT- and Mamba-based models. -- `launch_with_nemo_run.py` - NeMo-Run launcher (local or Slurm) -- `launch_with_sbatch.sh` - Direct sbatch launcher - -All scripts dynamically import recipes from `megatron.bridge.recipes`, apply user-provided overrides to the configuration, then begin training. - -## Quick Start - -For the end-to-end overview of how recipes are structured, overridden, and launched, see the official [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html). 
- -### Pretrain (single-GPU) - -```bash -uv run python run_recipe.py --recipe llama32_1b_pretrain_config -``` - -### Pretrain (multi-GPU) - -```bash -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_pretrain_config -``` - -### Finetune - -```bash -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_sft_config -``` - -## Usage with Different Models - -Same scripts work across all model families: - -```bash -# Llama -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_pretrain_config - -# Gemma -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe gemma3_1b_pretrain_config - -# Qwen -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe qwen3_8b_pretrain_config - -# GPT -uv run torchrun --nproc_per_node=8 run_recipe.py --recipe gpt_126m_pretrain_config -``` - -## CLI Overrides - -Override any config field using dot notation: - -```bash -uv run torchrun --nproc_per_node=8 run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - train.train_iters=5000 \ - optimizer.lr=0.0002 \ - model.tensor_model_parallel_size=2 -``` - -The first part before the dot specifies which ConfigContainer subconfig to override (e.g., `train`, `model`, `optimizer`), and the part after specifies the field. - -Configuration priority: -1. CLI overrides (highest) -2. Recipe defaults (lowest) - -Mode is inferred from the recipe name. If your recipe name doesn't include -`pretrain`, `finetune`, `sft`, or `peft`, pass `--mode` explicitly. - -## Step Function Selection - -Use `--step_func` to control the step function used during training. 
Available options: - -- `gpt_step` - Text-only models (default) -- `vlm_step` - Vision-language models -- `llava_step` - LLaVA models - -```bash -uv run torchrun --nproc_per_node=8 run_recipe.py \ - --recipe qwen25_vl_pretrain_config \ - --step_func vlm_step -``` - -## Multi-Node and Distributed Training - -### Option 1: NeMo-Run - -Prerequisites: - -```bash -pip install nemo-run -``` - -#### Test Locally First - -Before launching on Slurm, test your configuration locally: - -```bash -python launch_with_nemo_run.py \ - --local \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --devices 2 \ - --dry-run \ - train.train_iters=10 -``` - -This uses `LocalExecutor` with torchrun for single-node testing. Include `--dry-run` to confirm the composed nemo-run command before actually launching it. - -#### Launch on Slurm - -Once tested, scale to Slurm by removing `--local` and adding Slurm parameters: - -```bash -# From the cluster (LocalTunnel) -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 2 \ - --devices 8 \ - --partition gpu \ - --account my_account - -# From your local machine (SSHTunnel) -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 2 \ - --devices 8 \ - --partition gpu \ - --account my_account \ - --ssh-tunnel \ - --host my-cluster.example.com \ - --user myusername \ - --remote-job-dir /home/myusername/nemo-runs -``` - -#### With Containers - -When using containers, scripts are automatically packaged using `PatternPackager`: - -```bash -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe qwen3_8b_pretrain_config \ - --nodes 4 \ - --devices 8 \ - --partition gpu \ - --account my_account \ - --container-image /path/to/container.sqsh \ - --mount /data:/data -``` - -> **Note:** PatternPackager only includes `scripts/training/*.py`. 
Local changes in -> `src/megatron/bridge/` stay on your workstation unless you mount the repo into -> the container. - -```bash -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 2 \ - --partition gpu \ - --account my_account \ - --container-image /path/to/container.sqsh \ - --mount /path/to/your/Megatron-Bridge:/opt/Megatron-Bridge \ - train.train_iters=10 -``` - -Mounting onto `/opt/Megatron-Bridge` shadows the container's built-in source so -your edited `src/megatron/bridge/` files are used while packaged scripts still -run from the container workspace. - -For git-based packaging: - -```bash -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama3_8b_pretrain_config \ - --nodes 2 \ - --partition gpu \ - --account my_account \ - --container-image /path/to/container.sqsh \ - --packager git -``` - -#### Fault-Tolerant Training - -Use the fault-tolerant launcher for better resiliency: - -```bash -python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --launcher ft \ - --nodes 2 \ - --partition gpu \ - --account my_account -``` - -### Option 2: Direct sbatch - -For traditional HPC workflows without NeMo-Run, use the `launch_with_sbatch.sh` script. 
- -Edit the configuration section in `launch_with_sbatch.sh`: - -```bash -# Training script to run -TRAINING_SCRIPT="run_recipe.py" - -# Recipe name -RECIPE="llama32_1b_pretrain_config" - -# Step function (controls the step function: gpt_step, vlm_step, or llava_step) -STEP_TYPE="gpt_step" - -# Optional: CLI overrides -CLI_OVERRIDES="train.train_iters=5000 optimizer.lr=0.0003" - -# Optional: Container settings -CONTAINER_IMAGE="/path/to/container.sqsh" -CONTAINER_MOUNTS="/data:/data /model:/model" -``` - -Also configure the SBATCH directives at the top of the file: - -```bash -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 -#SBATCH --partition=gpu -#SBATCH --account=my_account -#SBATCH --time=04:00:00 -``` - -Then submit: - -```bash -sbatch launch_with_sbatch.sh -``` - -The script automatically: -- Sets up multi-node torchrun with correct SLURM environment variables -- Passes recipe and CLI override arguments to the training script -- Handles container execution (if specified) -- Applies container mounts - -## Recipe Arguments - -Generic scripts call recipes with no arguments passed to the recipe function. - -All customization happens through CLI overrides after the config is built. - -If you need to pass arguments to the recipe constructor itself (e.g., custom parallelism at recipe build time), use model-specific examples or create a custom script. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/launch_with_nemo_run.py -```py -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Launch Training with NeMo-Run - -Generic launcher for training scripts. Supports local execution and Slurm clusters. - -Prerequisites: Install nemo-run - -Usage: - # Test locally (single node) - python launch_with_nemo_run.py \ - --local \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --devices 2 - - # Launch on Slurm from the cluster (LocalTunnel) - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 2 \ - --partition gpu \ - --account my_account - - # Launch on Slurm from your local machine (SSHTunnel) - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_sft_config \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --ssh-tunnel \ - --host my-cluster.example.com \ - --user myusername \ - --remote-job-dir /home/myusername/nemo-runs - - # With CLI overrides - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe gemma3_1b_pretrain_config \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - train.train_iters=5000 \ - optimizer.lr=0.0002 - - # With containers (uses PatternPackager by default) - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe qwen3_8b_pretrain_config \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --container-image /path/to/container.sqsh \ - --mount /data:/data - - # With custom packager (git archive) - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama3_8b_pretrain_config \ - --nodes 2 \ - --partition gpu \ - --account 
my_account \ - --container-image /path/to/container.sqsh \ - --packager git - - # With environment variables (HF token, W&B key, etc.) - python launch_with_nemo_run.py \ - --script /opt/Megatron-Bridge/scripts/training/run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --container-image /path/to/container.sqsh \ - --mount /path/to/Megatron-Bridge:/opt/Megatron-Bridge \ - --env HF_TOKEN=your_token \ - --env WANDB_API_KEY=your_key - - # With fault-tolerant launcher - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --launcher ft \ - --nodes 2 \ - --partition gpu \ - --account my_account - - # Wait for completion and tail logs - python launch_with_nemo_run.py \ - --script run_recipe.py \ - --recipe llama32_1b_pretrain_config \ - --nodes 1 \ - --partition gpu \ - --account my_account \ - --no-detach \ - --tail-logs - -Note: -- Use --local for single-node testing with LocalExecutor -- Use --ssh-tunnel when launching to Slurm from your local machine -- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel) -- By default, jobs are submitted and detached (use --no-detach --tail-logs to monitor) -- With containers, scripts are auto-packaged using PatternPackager (or use --packager git) -- Any unknown arguments are forwarded to the training script -- Adjust cluster-specific settings (account, partition, container paths) -""" - -import argparse -import logging -from pathlib import Path - -import nemo_run as run - - -logger = logging.getLogger(__name__) - -SCRIPT_DIR = Path(__file__).parent.resolve() - - -def parse_args() -> tuple[argparse.Namespace, list[str]]: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Launch training with NeMo-Run (local or Slurm)", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--local", - action="store_true", - help="Run locally with 
LocalExecutor (single node). Omit for Slurm execution.", - ) - parser.add_argument( - "--script", - type=str, - required=True, - help="Training script to run (e.g., run_recipe.py, pretrain_vlm.py, finetune_vlm.py)", - ) - parser.add_argument( - "--recipe", - type=str, - required=True, - help="Recipe name (e.g., llama32_1b_pretrain_config)", - ) - parser.add_argument( - "--launcher", - type=str, - default="torchrun", - choices=["torchrun", "ft", "default"], - help="Launcher to use: 'torchrun', 'ft' (fault-tolerant), or 'default' (no launcher)", - ) - parser.add_argument( - "--devices", - type=int, - default=None, - help="GPUs per node. Required for --local. For Slurm, omit if cluster auto-allocates whole nodes.", - ) - parser.add_argument( - "--nodes", - type=int, - default=1, - help="Number of nodes to use (Slurm only, ignored for --local)", - ) - parser.add_argument( - "--partition", - type=str, - help="Slurm partition name (required for Slurm execution)", - ) - parser.add_argument( - "--account", - type=str, - help="Slurm account name (required for Slurm execution)", - ) - parser.add_argument( - "--time", - type=str, - default="04:00:00", - help="Job time limit", - ) - parser.add_argument( - "--gres", - type=str, - default=None, - help="Slurm GRES (e.g., 'gpu:8').", - ) - parser.add_argument( - "--ssh-tunnel", - action="store_true", - help="Use SSH tunnel (for launching from local machine). 
def main() -> None:
    """Launch a training script through NeMo-Run, locally or on Slurm.

    Parses the CLI, validates the option combination for the selected
    execution mode, wraps the training script in a ``run.Script`` task,
    builds a ``run.LocalExecutor`` or ``run.SlurmExecutor`` and then either
    dry-runs or runs the experiment.

    Raises:
        ValueError: If the options are inconsistent (``--ssh-tunnel`` together
            with ``--local``, missing ``--devices`` for local runs, missing
            ``--partition``/``--account`` for Slurm runs, incomplete SSH
            tunnel settings, or an ``--env`` entry without ``=``).
        FileNotFoundError: If a relative ``--script`` does not exist under
            ``SCRIPT_DIR``.
    """
    args, forwarded_args = parse_args()

    # Validate arguments based on execution mode
    if args.local:
        # Local execution - SSH tunnel args are not used
        if args.ssh_tunnel:
            raise ValueError("--ssh-tunnel cannot be used with --local")
        if args.devices is None:
            raise ValueError("--devices is required for --local execution")
    else:
        # Slurm execution - require partition and account
        if not args.partition or not args.account:
            raise ValueError("--partition and --account are required for Slurm execution (omit --local)")

        if args.ssh_tunnel and not all([args.host, args.user, args.remote_job_dir]):
            raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified")

    script_is_absolute = Path(args.script).is_absolute()
    if script_is_absolute:
        # Absolute paths are assumed to be container/cluster paths, so they
        # are not checked for existence on the launching machine.
        task_script_path = str(Path(args.script))
        logger.info(f"Using absolute script path (container/cluster): {task_script_path}")
    else:
        # Relative path - resolve from SCRIPT_DIR and validate
        script_path = SCRIPT_DIR / args.script
        if not script_path.exists():
            raise FileNotFoundError(f"Training script not found: {script_path}")
        # Default to the resolved location. Without this, the passthrough
        # packager (the default) left task_script_path unassigned and the
        # run.Script(...) call below failed with UnboundLocalError.
        task_script_path = str(script_path)

    script_args = ["--recipe", args.recipe]
    if forwarded_args:
        script_args.extend(forwarded_args)

    # Determine packager. Packaging methods ship the code themselves, so a
    # relative script is referenced by its relative path inside the package.
    if args.packager == "pattern":
        packager = run.PatternPackager(include_pattern="*.py", relative_path=str(SCRIPT_DIR))
        logger.info("Using PatternPackager")
        if not script_is_absolute:
            task_script_path = args.script
    elif args.packager == "git":
        packager = run.GitArchivePackager(subpath="scripts/training")
        logger.info("Using GitArchivePackager")
        if not script_is_absolute:
            task_script_path = args.script
    else:  # none
        packager = run.Packager()
        logger.info("Using passthrough packager (no packaging)")

    task = run.Script(
        path=task_script_path,
        entrypoint="python",
        args=script_args,
    )

    # Parse environment variables (split on the first '=' only, so values may
    # themselves contain '=').
    env_vars = {}
    for env_str in args.env:
        key, sep, value = env_str.partition("=")
        if not sep:
            raise ValueError(f"Invalid env format: {env_str}. Expected KEY=VALUE")
        env_vars[key] = value

    if env_vars:
        logger.info(f"Setting environment variables: {list(env_vars.keys())}")

    # 'default' means "no launcher"; the other choices are passed through by name.
    launcher = None if args.launcher == "default" else args.launcher
    if launcher == "ft":
        logger.debug("Using fault-tolerant launcher")

    if args.local:
        logger.debug("Using LocalExecutor")
        executor = run.LocalExecutor(
            ntasks_per_node=args.devices,
            launcher=launcher,
        )
    else:
        # Configure tunnel (SSH for remote, Local if already on cluster)
        if args.ssh_tunnel:
            tunnel = run.SSHTunnel(
                host=args.host,
                user=args.user,
                job_dir=args.remote_job_dir,
                identity=args.identity,
            )
            logger.debug(f"Using SSH tunnel to {args.user}@{args.host}")
        else:
            tunnel = run.LocalTunnel()
            logger.debug("Using LocalTunnel (running on cluster)")

        # Create the Slurm executor
        executor_kwargs = {
            "account": args.account,
            "partition": args.partition,
            "nodes": args.nodes,
            "mem": "0",
            "exclusive": True,
            "time": args.time,
            "tunnel": tunnel,
            "packager": packager,
        }

        # Add devices only if specified
        if args.devices is not None:
            executor_kwargs["ntasks_per_node"] = args.devices
            executor_kwargs["gpus_per_node"] = args.devices

        # Add gres only if explicitly specified
        if args.gres:
            executor_kwargs["gres"] = args.gres

        executor = run.SlurmExecutor(**executor_kwargs)

        # Configure container if specified
        if args.container_image:
            executor.container_image = args.container_image

        # Configure mounts if specified
        if args.mount:
            executor.container_mounts = args.mount

    # Set environment variables (applies to both executor kinds)
    if env_vars:
        executor.env_vars = env_vars

    # Run the experiment
    with run.Experiment(args.experiment_name) as exp:
        exp.add(task, executor=executor, name="training")

        if args.dry_run:
            # A dry run submits nothing, so no submission/completion message
            # is logged for it.
            exp.dryrun()
        else:
            exp.run(detach=args.detach, tail_logs=args.tail_logs)

            if args.detach:
                if args.local:
                    logger.info("Job started locally!")
                else:
                    logger.info("Job submitted to Slurm!")
                    logger.info("Use 'squeue' to check job status")
            else:
                logger.info("Job completed!")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    main()
- -## 🚀 Quick Start Paths - -### I want to - -**🏃‍♂️ Get started with model conversion** -→ Start with [Bridge Guide](bridge-guide.md) for Hugging Face ↔ Megatron conversion - -**⚡ Understand parallelisms and performance** -→ Jump to [Parallelisms Guide](parallelisms.md) and [Performance Guide](performance-guide.md) - -**🚀 Start training a model** -→ See [Training Documentation](training/README.md) for comprehensive training guides - -**📚 Find model documentation** -→ Browse [Supported Models](models/llm/index.md) for LLMs or [Vision Language Models](models/vlm/index.md) for VLMs - -**🔧 Migrate from NeMo 2 or Megatron-LM** -→ Check [NeMo 2 Migration Guide](nemo2-migration-guide.md) or [Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md) - -**📊 Use training recipes** -→ Read [Recipe Usage](recipe-usage.md) for pre-configured training recipes - -**🔌 Add support for a new model** -→ Refer to [Adding New Models](adding-new-models.md) - -**📋 Check version information** -→ See [Releases Documentation](releases/README.md) for versions, changelog, and known issues - ---- - -## 👥 Documentation by Role - -### For ML Engineers & Researchers - -- **Start here:** [Bridge Guide](bridge-guide.md) → [Training Documentation](training/README.md) -- **Deep dive:** [Performance Guide](performance-guide.md) → [Training Optimization Guides](training/README.md#optimization-and-performance) -- **Model support:** [Supported Models](models/llm/index.md) → [Adding New Models](adding-new-models.md) - -### For Training Engineers - -- **Start here:** [Training Documentation](training/README.md) → [Configuration Container Overview](training/config-container-overview.md) -- **Performance:** [Performance Guide](performance-guide.md) → [Performance Summary](performance-summary.md) -- **Parallelisms:** [Parallelisms Guide](parallelisms.md) → [Training Optimization](training/README.md#optimization-and-performance) - -### For Model Developers - -- **Start here:** [Bridge 
Guide](bridge-guide.md) → [Bridge Tech Details](bridge-tech-details.md) -- **Model support:** [Adding New Models](adding-new-models.md) → [Model Documentation](models/llm/index.md) -- **Integration:** [Bridge RL Integration](bridge-rl-integration.md) - -### For DevOps & Platform Teams - -- **Start here:** [Releases Documentation](releases/README.md) → [Software Versions](releases/software-versions.md) -- **Troubleshooting:** [Known Issues](releases/known-issues.md) -- **API Reference:** [API Documentation](apidocs/index.rst) - ---- - -## 📚 Complete Documentation Index - -### Getting Started - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Bridge Guide](bridge-guide.md)** | Hugging Face ↔ Megatron conversion guide | First time converting models | -| **[Bridge Tech Details](bridge-tech-details.md)** | Technical details of the bridge system | Understanding bridge internals | -| **[Parallelisms Guide](parallelisms.md)** | Data and model parallelism strategies | Setting up distributed training | -| **[Performance Summary](performance-summary.md)** | Quick performance reference | Quick performance lookup | -| **[Performance Guide](performance-guide.md)** | Comprehensive performance optimization | Optimizing training performance | - -### Model Support - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Large Language Models](models/llm/index.md)** | LLM model documentation | Working with LLM models | -| **[Vision Language Models](models/vlm/index.md)** | VLM model documentation | Working with VLM models | -| **[Adding New Models](adding-new-models.md)** | Guide for adding model support | Extending model support | - -### Training and Customization - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Training Documentation](training/README.md)** | Comprehensive training guides | Setting up and customizing training | -| **[Configuration Container 
Overview](training/config-container-overview.md)** | Central training configuration | Understanding training configuration | -| **[Entry Points](training/entry-points.md)** | Training entry points and execution | Understanding training flow | -| **[Training Loop Settings](training/training-loop-settings.md)** | Training loop parameters | Configuring training parameters | -| **[Optimizer & Scheduler](training/optimizer-scheduler.md)** | Optimization configuration | Setting up optimizers | -| **[Mixed Precision](training/mixed-precision.md)** | Mixed precision training | Reducing memory usage | -| **[PEFT](training/peft.md)** | Parameter-efficient fine-tuning | Fine-tuning with limited resources | -| **[Checkpointing](training/checkpointing.md)** | Checkpoint management | Saving and resuming training | -| **[Logging](training/logging.md)** | Logging and monitoring | Monitoring training progress | -| **[Profiling](training/profiling.md)** | Performance profiling | Identifying bottlenecks | - -### Recipes and Workflows - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Recipe Usage](recipe-usage.md)** | Using pre-configured training recipes | Quick training setup | -| **[Bridge RL Integration](bridge-rl-integration.md)** | Reinforcement learning integration | RL training workflows | - -### Migration Guides - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[NeMo 2 Migration Guide](nemo2-migration-guide.md)** | Migrating from NeMo 2 | Upgrading from NeMo 2 | -| **[Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md)** | Migrating from Megatron-LM | Upgrading from Megatron-LM | - -### Reference - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[API Documentation](apidocs/index.rst)** | Complete API reference | Building integrations | -| **[Releases Documentation](releases/README.md)** | Version history and known issues | Checking versions, troubleshooting | 
-| **[Documentation Guide](documentation.md)** | Contributing to documentation | Contributing docs | - ---- - -## 🗺️ Common Reading Paths - -### 🆕 First-Time Users - -1. [Bridge Guide](bridge-guide.md) *(10 min - understand conversion)* -2. [Parallelisms Guide](parallelisms.md) *(15 min - understand distributed training)* -3. [Training Documentation](training/README.md) *(choose your training path)* -4. [Recipe Usage](recipe-usage.md) *(5 min - use pre-configured recipes)* - -### 🔧 Setting Up Training - -1. [Training Documentation](training/README.md) *(overview of training system)* -2. [Configuration Container Overview](training/config-container-overview.md) *(understand configuration)* -3. [Entry Points](training/entry-points.md) *(how training starts)* -4. [Training Loop Settings](training/training-loop-settings.md) *(configure parameters)* -5. [Logging](training/logging.md) *(set up monitoring)* - -### ⚡ Performance Optimization - -1. [Performance Guide](performance-guide.md) *(comprehensive optimization strategies)* -2. [Performance Summary](performance-summary.md) *(quick reference)* -3. [Mixed Precision](training/mixed-precision.md) *(reduce memory usage)* -4. [Communication Overlap](training/communication-overlap.md) *(optimize distributed training)* -5. [Activation Recomputation](training/activation-recomputation.md) *(reduce memory footprint)* -6. [Profiling](training/profiling.md) *(identify bottlenecks)* - -### 🔄 Model Conversion Workflow - -1. [Bridge Guide](bridge-guide.md) *(conversion basics)* -2. [Bridge Tech Details](bridge-tech-details.md) *(technical details)* -3. [Supported Models](models/llm/index.md) or [Vision Language Models](models/vlm/index.md) *(model-specific guides)* -4. [Adding New Models](adding-new-models.md) *(extend support)* - -### 🔧 Customization and Extension - -1. [Training Documentation](training/README.md) *(training customization)* -2. [PEFT](training/peft.md) *(parameter-efficient fine-tuning)* -3. 
[Distillation](training/distillation.md) *(knowledge distillation)* -4. [Adding New Models](adding-new-models.md) *(add model support)* -5. [Bridge RL Integration](bridge-rl-integration.md) *(RL workflows)* - -### 📦 Migration Paths - -1. [NeMo 2 Migration Guide](nemo2-migration-guide.md) *(from NeMo 2)* -2. [Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md) *(from Megatron-LM)* -3. [Training Documentation](training/README.md) *(new training system)* - ---- - -## 📁 Directory Structure - -### Main Documentation - -- **Guides** - Core guides for parallelisms, performance, recipes, and migration -- **Bridge Documentation** - Hugging Face ↔ Megatron conversion guides -- **Model Documentation** - Supported model families and architectures - -### Subdirectories - -#### [models/](models/README.md) - -- **[llm/](models/llm/README.md)** - Large Language Model documentation - - Individual model guides (Qwen, LLaMA, Mistral, etc.) - - Conversion examples and training recipes -- **[vlm/](models/vlm/README.md)** - Vision Language Model documentation - - VLM model guides (Qwen VL, Gemma VL, etc.) - - Multimodal model support - -#### [training/](training/README.md) - -- **Configuration** - ConfigContainer, entry points, training loop settings -- **Optimization** - Optimizer, scheduler, mixed precision, communication overlap -- **Performance** - Attention optimizations, activation recomputation, CPU offloading -- **Monitoring** - Logging, profiling, checkpointing, resiliency -- **Advanced** - PEFT, packed sequences, distillation - -#### [releases/](releases/README.md) - -- **Software Versions** - Current versions and dependencies -- **Changelog** - Release history and changes -- **Known Issues** - Bugs, limitations, and workarounds - ---- - -## 🔗 How Documents Connect - -```mermaid -graph TD - A[README.md
Start Here] --> B[Bridge Guide
Model Conversion] - A --> C[Training Docs
Training Setup] - A --> D[Models
Model Support] - - B --> E[Bridge Tech Details
Technical Deep Dive] - B --> F[Supported Models
Model-Specific Guides] - - C --> G[Config Container
Configuration] - C --> H[Performance Guide
Optimization] - C --> I[Parallelisms
Distributed Training] - - G --> J[Training Loop
Training Parameters] - G --> K[Optimizer & Scheduler
Optimization Setup] - - H --> L[Mixed Precision
Memory Efficiency] - H --> M[Communication Overlap
Performance] - - I --> N[Data Parallelism
DDP] - I --> O[Model Parallelism
TP/PP/VPP] - - D --> P[LLM Models
Language Models] - D --> Q[VLM Models
Vision Language Models] - - style A fill:#e1f5fe - style B fill:#f3e5f5 - style C fill:#e8f5e8 - style D fill:#fff3e0 - style H fill:#fce4ec - style I fill:#e0f2f1 -``` - ---- - -## 🤝 Getting Help - -- **GitHub Issues:** [Report bugs or request features](https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues) -- **Documentation Issues:** Found something unclear? Let us know! -- **Community:** Join discussions and share experiences - ---- - -## 📖 Additional Resources - -- **[Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples)** - Code examples and tutorials -- **[Contributing Guide](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/CONTRIBUTING.md)** - How to contribute to the project -- **[API Documentation](apidocs/index.rst)** - Complete API reference - ---- - -**Ready to get started?** Choose your path above or dive into the [Bridge Guide](bridge-guide.md) for model conversion! 🚀 - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-summary.md -```md -# Performance - -As part of the NVIDIA NeMo Framework, Megatron Bridge provides optimal performance for training advanced generative AI models by incorporating the most recent training techniques, such as model parallelization, optimized attention mechanisms, and more, to achieve high training throughput. - -This page provides performance benchmarks for large language models using Megatron-Bridge across different GPU systems and configurations. 
- -## Nomenclature - -- **GBS**: Global Batch Size -- **MBS**: Micro Batch Size -- **FSDP**: Fully Sharded Data Parallel - - FSDP > 0: use FSDP with sharding group size = #GPUs / (TP × PP) - - FSDP = 0: use DDP (Distributed Data Parallel) -- **TP**: Tensor Parallel Size -- **PP**: Pipeline Parallel Size -- **CP**: Context Parallel Size -- **VP**: Virtual Pipeline Parallel Size -- **EP**: Expert Parallel Size -- **GA**: Number of Gradient Accumulations - -## Performance Metrics - -Performance is measured using: - -- **Tokens/sec/GPU**: Throughput per GPU -- **Model TFLOP/sec/GPU**: Model floating-point operations per second per GPU - -## Performance Summary for Large Language Models - -Below are performance benchmarks for various large language models. These results were obtained using performance recipes available [here](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/scripts/performance). - -The performance data includes: - -- **Pre-training Performance**: Throughput metrics for various model sizes and architectures -- **System Configurations**: Results across different GPU systems (DGX-GB300, DGX-GB200, DGX-B300, DGX-B200, DGX-H100) -- **Precision Options**: Performance comparisons between different precision modes (BF16, FP8, MXFP8) - ---- - -## 26.02.01 NeMo Container - -### Pre-Training Performance - -#### Model: LLAMA3_70B - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 64 | NVFP4 | 256 | 2 | 8192 | 0 | 1 | 1 | 1 | n/a | n/a | 7002 | 3147 | -| DGX-GB200 | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 4557 | 2047 | -| DGX-GB300 | 64 | MXFP8 | 256 | 2 | 8192 | 0 | 1 | 4 | 1 | n/a | n/a | 4798 | 2157 | -| DGX-GB200 | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 3837 | 1724 | -| 
DGX-GB300 | 64 | FP8 | 256 | 2 | 8192 | 64 | 1 | 1 | 1 | n/a | n/a | 5243 | 2353 | -| DGX-GB200 | 64 | FP8 | 256 | 2 | 8192 | 64 | 1 | 1 | 1 | n/a | n/a | 4357 | 1956 | -| DGX-H100 | 64 | FP8 | 256 | 1 | 8192 | 0 | 4 | 8 | 1 | 5 | n/a | 1639 | 736 | - -#### Model: LLAMA3.1_405B - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 1358 | 3428 | -| DGX-GB200 | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 1083 | 2734 | -| DGX-GB300 | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 949 | 2394 | -| DGX-GB200 | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 775 | 1957 | -| DGX-GB300 | 256 | FP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 1024 | 2585 | -| DGX-GB200 | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 818 | 2063 | - -#### Model: DeepSeekV3 - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 2 | 1 | 8 | 32 | 4691 | 1219 | -| DGX-GB200 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 4021 | 1046 | -| DGX-B300 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 3099 | 806 | -| DGX-B200 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2790 | 725 | - -#### Model: GPT OSS 120B - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | 
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 19366 | 526 | -| DGX-GB200 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 15754 | 428 | -| DGX-B300 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 15031 | 412 | -| DGX-B200 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 13722 | 373 | -| DGX-H100 | 64 | BF16 | 1280 | 1 | 4096 | 0 | 1 | 4 | 1 | n/a | 8 | 5984 | 163 | - -#### Model: Qwen3_30B_a3B - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 30411 | 700 | -| DGX-GB200 | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26373 | 607 | -| DGX-B300 | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 29454 | 678 | -| DGX-B200 | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26695 | 614 | -| DGX-H100 | 16 | FP8 | 1024 | 1 | 4096 | 0 | 1 | 2 | 1 | 12 | 8 | 9058 | 208 | - -#### Model: Qwen3_235B_a22B - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 256 | MXFP8 | 8192 | 2 | 4096 | 0 | 1 | 4 | 1 | n/a | 32 | 6583 | 974 | -| DGX-GB200 | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 32 | 5530 | 819 | -| DGX-B300 | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | 4 | 8 | 2644 | 391 | -| DGX-H100 | 256 | FP8 | 8192 | 1 | 4096 | 0 | 2 | 8 | 1 | 4 | 32 | 1611 | 238 | - -#### Model: Nemotron_3_Nano - -| 
System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 8 | MXFP8 | 512 | 4 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 37664 | 839 | -| DGX-GB200 | 8 | MXFP8 | 512 | 2 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 33934 | 756 | -| DGX-B300 | 8 | MXFP8 | 512 | 4 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 35861 | 798 | -| DGX-H100 | 16 | FP8 | 1024 | 1 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 14890 | 331 | - -#### Model: Kimi_K2 - -| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU | -|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------| -| DGX-GB300 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 5072 | 1037 | - -- Muon optimizer was used for pre-training Kimi-K2. - -- In MoE training benchmarks, we force-balance the token distribution among experts and all benchmarks are token-dropless. - -## Archive - -Performance summary for past releases can be found in the [archive](performance-summary-archive.md). -``` - -File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/run_recipe.py -```py -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Generic Training Script for LLM and diffusion models - -This script works with any model family that uses GPT-style training -(Llama, Gemma, Qwen, GPT, etc.) and with diffusion models (e.g. FLUX, WAN). It dynamically loads recipes and supports -CLI overrides. The --dataset flag selects the dataset type and automatically -infers pretrain vs finetune mode. - -Usage: - Pretrain (mock data): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_pretrain_config \\ - --dataset llm-pretrain-mock - - Pretrain (real data): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_pretrain_config \\ - --dataset llm-pretrain \\ - 'dataset.blend=[[/data/my_dataset_text_document],null]' - - Finetune (SQuAD, default): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_sft_config \\ - --dataset llm-finetune - - Finetune (GSM8K): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_sft_config \\ - --dataset llm-finetune \\ - dataset.dataset_name=gsm8k - - Finetune (user-supplied JSONL): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_sft_config \\ - --dataset llm-finetune-preloaded \\ - dataset.dataset_root=/data/my_finetune_data - - Diffusion pretrain: - uv run torchrun --nproc_per_node=8 run_recipe.py \ - --recipe wan_1_3B_pretrain_config \ - --step_func wan_step \ - dataset.path=/data/energon - - Diffusion SFT (full finetuning): - uv run torchrun --nproc_per_node=8 run_recipe.py \ - --recipe wan_1_3B_sft_config \ - --step_func wan_step - dataset.path=/data/energon - - VLM with HF dataset: - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe qwen3_vl_8b_peft_config \\ - --dataset vlm-hf \\ - --step_func qwen3_vl_step \\ - dataset.maker_name=cord_v2 \\ - dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\ - 
checkpoint.pretrained_checkpoint=/path/to/checkpoint - - VLM with Energon dataset: - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe qwen3_vl_8b_peft_energon_config \\ - --dataset vlm-energon \\ - --step_func qwen3_vl_step \\ - dataset.path=/data/energon \\ - checkpoint.pretrained_checkpoint=/path/to/checkpoint - - VLM with preloaded JSON: - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe qwen3_vl_8b_peft_config \\ - --dataset vlm-preloaded \\ - --step_func qwen3_vl_step \\ - dataset.train_data_path=/data/vlm_train.json \\ - dataset.image_folder=/data/vlm_images \\ - dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\ - checkpoint.pretrained_checkpoint=/path/to/checkpoint - - With CLI overrides (Hydra-style, works for any config field): - uv run torchrun --nproc_per_node=8 run_recipe.py \\ - --recipe llama32_1b_pretrain_config \\ - --dataset llm-pretrain-mock \\ - train.train_iters=5000 \\ - optimizer.lr=0.0003 - -Recipe Arguments: - Generic scripts call recipes with no arguments: recipe(). - - If you need to pass arguments to the recipe constructor - (e.g., custom parallelism at build time), create a custom script. 
-""" - -import argparse -import inspect -from typing import Callable - -import megatron.bridge.recipes as recipes - -# Diffusion forward steps: use class instances so they can be passed as forward_step_func -from megatron.bridge.diffusion.models.flux.flux_step import FluxForwardStep -from megatron.bridge.diffusion.models.wan.wan_step import WanForwardStep -from megatron.bridge.models.qwen_vl.qwen3_vl_step import forward_step as qwen3_vl_forward_step -from megatron.bridge.recipes.utils.dataset_utils import ( - DATASET_TYPES, - apply_dataset_override, - infer_mode_from_dataset, -) -from megatron.bridge.training.audio_lm_step import forward_step as audio_lm_forward_step -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.finetune import finetune -from megatron.bridge.training.gpt_step import forward_step as gpt_forward_step -from megatron.bridge.training.llava_step import forward_step as llava_forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import process_config_with_overrides -from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step - - -STEP_FUNCTIONS: dict[str, Callable] = { - "audio_lm_step": audio_lm_forward_step, - "gpt_step": gpt_forward_step, - "vlm_step": vlm_forward_step, - "qwen3_vl_step": qwen3_vl_forward_step, - "llava_step": llava_forward_step, - "flux_step": FluxForwardStep, - "wan_step": WanForwardStep, -} - -TRAIN_FUNCTIONS = { - "pretrain": pretrain, - "finetune": finetune, -} - -ERR_UNKNOWN_STEP = "Unknown step type: {step_type}. Choose from: {choices}" -ERR_INFER_MODE_FAILED = ( - "Unable to infer training mode. " - "Pass --dataset to specify the dataset type, or include 'pretrain' or 'finetune' " - "(or 'sft'/'peft') in the recipe name." 
def parse_args() -> tuple[argparse.Namespace, list[str]]:
    """Parse command-line arguments.

    Returns:
        A ``(args, cli_overrides)`` pair: the namespace of options declared
        here, and every remaining token, left untouched so it can be applied
        later as a config override.
    """
    parser = argparse.ArgumentParser(
        description="Generic training script for LLM and diffusion models",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--recipe",
        type=str,
        required=True,
        help="Recipe function name (e.g., llama32_1b_pretrain_config, gemma3_1b_sft_config, gemma3_1b_peft_config)",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        choices=DATASET_TYPES,
        help=(
            "Dataset type. Training mode (pretrain/finetune) is inferred from this.\n"
            "LLM datasets:\n"
            "  llm-pretrain             GPT pretrain data (set dataset.blend=)\n"
            "  llm-pretrain-mock        Mock pretrain data for testing\n"
            "  llm-finetune             HF finetune dataset (set dataset.dataset_name=squad|gsm8k|openmathinstruct2)\n"
            "  llm-finetune-preloaded   User-supplied JSONL (set dataset.dataset_root=)\n"
            "VLM datasets:\n"
            "  vlm-energon              Energon multimodal (set dataset.path=)\n"
            "  vlm-hf                   HF VLM dataset (set dataset.maker_name=)\n"
            "  vlm-preloaded            User-supplied VLM JSON (set dataset.train_data_path=)"
        ),
    )
    parser.add_argument(
        "--step_func",
        type=str,
        default="gpt_step",
        choices=sorted(STEP_FUNCTIONS),
        # This parser has no --mode flag: the mode handed to class-based steps
        # is the one inferred from --dataset or the recipe name.
        help="Step function: gpt_step (text-only), vlm_step (vision-language), qwen3_vl_step (Qwen3-VL), "
        "llava_step (LLaVA), audio_lm_step (audio LM), flux_step (FLUX diffusion), "
        "wan_step (WAN diffusion, hyperparameters selected by the training mode inferred "
        "from --dataset or the recipe name)",
    )
    parser.add_argument(
        "--peft_scheme",
        type=str,
        default=None,
        help="PEFT scheme to use: 'lora', 'dora', or None.",
    )
    parser.add_argument(
        "--packed_sequence",
        action="store_true",
        default=False,
        help="Enable packed sequence training (default: False)",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=None,
        help="Sequence length for training",
    )
    parser.add_argument(
        "--hf_path",
        type=str,
        default=None,
        help="HuggingFace model ID or local path to model directory. "
        "Use a local path for more stable multinode training.",
    )
    # parse_known_args keeps unrecognised tokens instead of erroring on them.
    args, cli_overrides = parser.parse_known_args()
    return args, cli_overrides


def load_recipe(
    recipe_name: str,
    peft_scheme: str | None,
    packed_sequence: bool = False,
    seq_length: int | None = None,
    hf_path: str | None = None,
) -> ConfigContainer:
    """Load a recipe by name from megatron.bridge.recipes and call it.

    Only the keyword arguments the recipe's signature accepts are passed.
    If the recipe still rejects them with a ``TypeError``, it is retried with
    no arguments and a warning is emitted, because the requested options are
    then not applied.

    Args:
        recipe_name: Full recipe function name (e.g., 'llama32_1b_pretrain_config')
        peft_scheme: PEFT scheme to use ('lora', 'dora', or None)
        packed_sequence: Enable packed sequence training (default: False)
        seq_length: Sequence length for training (optional)
        hf_path: HuggingFace model ID or local path to model directory (optional)

    Returns:
        ConfigContainer from calling the recipe

    Raises:
        AttributeError: If recipe not found
        TypeError: If the recipe fails with a TypeError even when called
            without arguments.
    """
    if not hasattr(recipes, recipe_name):
        raise AttributeError(
            f"Recipe '{recipe_name}' not found in megatron.bridge.recipes.\n"
            f"Make sure the recipe name is correct and the recipe is exported in its family __init__.py.\n"
            f"Example recipe names: llama32_1b_pretrain_config, gemma3_1b_pretrain_config, qwen3_8b_pretrain_config"
        )

    config_builder = getattr(recipes, recipe_name)

    # Inspect the recipe's signature to determine which arguments it accepts
    try:
        params = inspect.signature(config_builder).parameters
    except (ValueError, TypeError):
        # If signature inspection fails, fall back conservatively: peft is
        # widely supported so it is still tried, the rest are not passed.
        accepts_peft = True
        accepts_packed_sequence = False
        accepts_seq_length = False
        accepts_hf_path = False
    else:
        # A **kwargs parameter means any keyword is accepted.
        has_var_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
        accepts_peft = "peft" in params or has_var_keyword
        accepts_packed_sequence = "packed_sequence" in params or has_var_keyword
        accepts_seq_length = "seq_length" in params or has_var_keyword
        accepts_hf_path = "hf_path" in params or has_var_keyword

    # Build kwargs dynamically based on what the recipe accepts. peft is passed
    # even when None; the others only when the user actually set them.
    kwargs = {}
    if accepts_peft:
        kwargs["peft"] = peft_scheme
    if accepts_packed_sequence and packed_sequence:
        kwargs["packed_sequence"] = packed_sequence
    if accepts_seq_length and seq_length is not None:
        kwargs["seq_length"] = seq_length
    if accepts_hf_path and hf_path is not None:
        kwargs["hf_path"] = hf_path

    try:
        return config_builder(**kwargs)
    except TypeError as err:
        if not kwargs:
            # Nothing was passed, so a bare retry would be the identical call.
            raise
        import warnings

        # Keep the best-effort fallback, but make it visible: the options in
        # kwargs are dropped by the retry below.
        warnings.warn(
            f"Recipe '{recipe_name}' raised TypeError when called with {sorted(kwargs)} ({err}); "
            "retrying with no arguments, so these options are NOT applied.",
            stacklevel=2,
        )
        return config_builder()


def load_forward_step(step_type: str, mode: str | None = None) -> Callable:
    """Load forward_step function based on the requested step type.

    Args:
        step_type: Key into ``STEP_FUNCTIONS`` (matched case-insensitively).
        mode: Training mode forwarded to class-based steps whose ``__init__``
            declares a ``mode`` parameter; ignored otherwise.

    Returns:
        The registered function, or a fresh instance when the registered
        entry is a class.

    Raises:
        ValueError: If ``step_type`` is not a registered step.
    """
    step_key = step_type.lower()
    if step_key not in STEP_FUNCTIONS:
        raise ValueError(ERR_UNKNOWN_STEP.format(step_type=step_type, choices=", ".join(STEP_FUNCTIONS)))

    step = STEP_FUNCTIONS[step_key]
    if not inspect.isclass(step):
        return step

    # Class entries must be instantiated before use as forward_step_func.
    if "mode" in inspect.signature(step.__init__).parameters:
        return step(mode=mode)
    return step()


def infer_train_mode(recipe_name: str) -> str:
    """Infer training mode from the recipe name (fallback when --dataset is not passed).

    Returns ``"pretrain"`` when the name contains 'pretrain', and
    ``"finetune"`` when it contains 'finetune', 'sft' or 'peft'.

    Raises:
        ValueError: If the name matches neither group, or both.
    """
    lowered = recipe_name.lower()
    has_pretrain = "pretrain" in lowered
    has_finetune = any(marker in lowered for marker in ("finetune", "sft", "peft"))
    # Exactly one of the two groups must match; anything else is ambiguous.
    if has_pretrain == has_finetune:
        raise ValueError(ERR_INFER_MODE_FAILED)
    return "pretrain" if has_pretrain else "finetune"


def main() -> None:
    """Run training (pretrain or finetune) for the recipe selected on the CLI."""
    args, cli_overrides = parse_args()

    config: ConfigContainer = load_recipe(
        args.recipe,
        args.peft_scheme,
        args.packed_sequence,
        args.seq_length,
        args.hf_path,
    )

    if args.dataset is not None:
        # An explicit dataset type determines the mode and replaces the
        # recipe's dataset config.
        mode = infer_mode_from_dataset(args.dataset)
        config = apply_dataset_override(
            config,
            dataset_type=args.dataset,
            packed_sequence=args.packed_sequence,
            seq_length=args.seq_length,
            cli_overrides=cli_overrides,
        )
    else:
        mode = infer_train_mode(args.recipe)

    config = process_config_with_overrides(
        config,
        cli_overrides=cli_overrides or None,
    )

    # Ensure dataset.seq_length and model.seq_length stay in sync after CLI
    # overrides; the dataset value wins.
    model_cfg = getattr(config, "model", None)
    dataset_cfg = getattr(config, "dataset", None)
    if model_cfg is not None and dataset_cfg is not None:
        if hasattr(dataset_cfg, "seq_length") and model_cfg.seq_length != dataset_cfg.seq_length:
            model_cfg.seq_length = dataset_cfg.seq_length

    forward_step = load_forward_step(args.step_func, mode=mode)
    train_func = TRAIN_FUNCTIONS[mode]
    train_func(config=config, forward_step_func=forward_step)


if __name__ == "__main__":
    main()
-- **Customization**: You can override any part of the recipe (Python, YAML, CLI) to adapt to your data, scale, and objectives. - -## Overriding configuration - -Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md). -The benefit of providing the full recipe through a pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse` or something else. In other words, the user may override the recipe however they see fit. - -The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py). - - -### Python - -If you prefer to manage configuration in Python, you can directly modify attributes of the `ConfigContainer`: - -```python -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config - -# Get the base ConfigContainer from the recipe -cfg: ConfigContainer = pretrain_config() - -# Apply overrides. 
Note the hierarchical structure -cfg.train.train_iters = 20 -cfg.train.global_batch_size = 8 -cfg.train.micro_batch_size = 1 -cfg.logger.log_interval = 1 -``` - -You can also replace entire sub-configs of the `ConfigContainer`: - -```python -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.models.llama import Llama3ModelProvider - -cfg: ConfigContainer = pretrain_config() - -small_llama = Llama3ModelProvider( - num_layers=2, - hidden_size=768, - ffn_hidden_size=2688, - num_attention_heads=16, -) -cfg.model = small_llama -``` - -### YAML -Overriding a configuration recipe with a YAML file can be done using OmegaConf utilities: - -```python -from omegaconf import OmegaConf -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, -) - -cfg: ConfigContainer = pretrain_config() -yaml_filepath = "conf/llama3-8b-benchmark-cfg.yaml" - -# Convert the initial Python dataclass to an OmegaConf DictConfig for merging -# excluded_fields holds some configuration that cannot be serialized into a DictConfig -merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - -# Load and merge YAML overrides -yaml_overrides_omega = OmegaConf.load(yaml_filepath) -merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - -# Apply overrides while preserving excluded fields -final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) -apply_overrides(cfg, final_overrides_as_dict, excluded_fields) -``` - -The above snippet will update `cfg` with all overrides from `llama3-8b-benchmark-cfg.yaml`. 
- -### Hydra-style - -Megatron Bridge provides some utilities to update the ConfigContainer using Hydra-style CLI overrides: - -```python -import sys -from omegaconf import OmegaConf -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) - -cfg: ConfigContainer = pretrain_config() -cli_overrides = sys.argv[1:] - -# Convert the initial Python dataclass to an OmegaConf DictConfig for merging -# excluded_fields holds some configuration that cannot be serialized into a DictConfig -merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - -# Parse and merge CLI overrides -merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - -# Apply overrides while preserving excluded fields -final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) -apply_overrides(cfg, final_overrides_as_dict, excluded_fields) -``` - -After the above snippet, `cfg` will be updated with all CLI-provided overrides. -A script containing the above code could be called like so: - -```sh -torchrun pretrain_cli_overrides.py model.tensor_model_parallel_size=4 train.train_iters=100000 ... -``` - -## Launch methods - -Megatron Bridge supports launching scripts with both `torchrun` and [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). -Once your script is ready to be launched, refer to one of the following sections. - -### Torchrun -Megatron Bridge training scripts can be launched with the `torchrun` command that most PyTorch users are familiar with. -Simply specify the number of GPUs to use with `--nproc-per-node` and the number of nodes with `--nnodes`. For example, on a single node: - -```sh -torchrun --nnodes 1 --nproc-per-node 8 /path/to/train/script.py -``` - -For multi-node training, it is recommended to use a cluster orchestration system like SLURM. 
-The `torchrun` command should be wrapped as specified by your cluster orchestration system. -For example, with Slurm, wrap the `torchrun` command inside of `srun`: - -```sh -# launch.sub - -srun --nodes 2 --gpus-per-node 8 \ - --container-image --container-mounts \ - bash -c " - torchrun --nnodes $SLURM_NNODES --nproc-per-node $SLURM_GPUS_PER_NODE /path/to/train/script.py - " -``` - -Along with any other required flags. It is also recommended to use a NeMo Framework container with Slurm. You can find a list of container tags on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags). - -### NeMo-Run - -Megatron Bridge also supports launching training with [NeMo-Run](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html). NeMo-Run is a Python package that enables configuring and executing experiments across several platforms. -For multi-node training, NeMo-Run will generate a script with appropriate commands, similar to the `srun` command described above. - -The recommended method to launch a Megatron Bridge script with NeMo-Run is through the `run.Script` API. -You can modify the following 3 steps to your needs in a new file: - -```python -import nemo_run as run - -if __name__ == "__main__": - # 1) Configure the `run.Script` object - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - - # 2) Define an executor for the desired target platform - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - # 3) Execute - run.run(train_script, executor=executor) -``` - -NeMo-Run supports launching on several different platforms, including [SLURM clusters](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#slurmexecutor). 
-For more details, please see the NeMo-Run [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#) for a list of supported platforms, their corresponding executors, and configuration instructions. - -You can also forward arguments from the NeMo-Run launch script to the target script: - -```python -import nemo_run as run -import argparse - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - ... - known_args, args_to_fwd = parser.parse_known_args() - train_script = run.Script(..., args=args_to_fwd) -``` - -For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py). - -#### Plugins - -Megatron Bridge provides several NeMo-Run plugins to simplify the usage of certain features. -These plugins can simply be added to the `run.run()` call: - -```python -import nemo_run as run -from megatron.bridge.recipes.run_plugins import NsysPlugin - -if __name__ == "__main__": - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - plugins = [] # plugins argument expects a list - nsys = NsysPlugin(profile_step_start=10, profile_step_end=15, ...) - plugins.append(nsys) - run.run(train_script, plugins=plugins, executor=executor) -``` - -##### Custom Argument Converters - -By default, plugins convert their configuration to Hydra-style CLI arguments when used with `run.Script` tasks. If your training script uses a different argument format (e.g., argparse), you can provide a custom converter function via the `script_args_converter_fn` parameter. 
- ```python -import nemo_run as run -from typing import List -from megatron.bridge.recipes.run_plugins import ( - PreemptionPlugin, - PreemptionPluginScriptArgs, -) - -# Define a custom converter for argparse-style arguments -def argparse_preemption_converter(args: PreemptionPluginScriptArgs) -> List[str]: - result = [] - if args.enable_exit_handler: - result.append("--enable-exit-handler") - if args.enable_exit_handler_for_data_loader: - result.append("--enable-exit-handler-dataloader") - return result - -if __name__ == "__main__": - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - # Use the plugin with the custom converter - plugin = PreemptionPlugin( - preempt_time=120, - enable_exit_handler=True, - script_args_converter_fn=argparse_preemption_converter, - ) - run.run(train_script, plugins=[plugin], executor=executor) -``` - -Each plugin provides its own corresponding dataclass (e.g., `PreemptionPluginScriptArgs`, `NsysPluginScriptArgs`) that defines the available arguments for conversion. - -See the [API reference](#bridge.recipes.run_plugins) for a list of available NeMo-Run plugins. - -### Avoiding Hangs - -When working with any scripts in Megatron Bridge, please make sure you wrap your code in an `if __name__ == "__main__":` -block. Otherwise, your code may hang unexpectedly. - -The reason for this is that Megatron Bridge uses Python's `multiprocessing` module in the backend when running a -multi-GPU job. The multiprocessing module will create new Python processes that will import the current module (your -script). If you omit the `if __name__ == "__main__":` guard, importing your module spawns new processes, which import the -module and each spawn new processes in turn. This results in an infinite loop of process spawning.
- -## Resources - -- [OmegaConf documentation](https://omegaconf.readthedocs.io/en/2.3_branch/) -- [torchrun Documentation](https://docs.pytorch.org/docs/stable/elastic/run.html) -- [PyTorch Multinode Training documentation](https://docs.pytorch.org/tutorials/intermediate/ddp_series_multinode.html) -- [NeMo-Run documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html#) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md -```md -# Packed Sequences - -Packed sequences are a fine-tuning technique that reduces padding waste by -concatenating multiple examples into one pack while preserving sequence -boundaries for attention. In Megatron Bridge, this is primarily a supervised -fine-tuning and PEFT optimization rather than a general pretraining feature. - -This page is the stable overview for what packed sequences are, when to use -them, and which constraints are durable. For operational setup, code anchors, -and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md). - -## What It Is - -Fine-tuning datasets often contain examples with highly variable lengths. When -those examples are batched conventionally, many tokens in each batch are just -padding. Packed sequences reduce that waste by building longer packs from -multiple examples and carrying boundary metadata into the attention path. - -In Bridge today, there are two distinct packing paths plus long-context -enablement through context parallelism: - -| Path | Use case | Key config | -|---|---|---| -| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` | -| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` | -| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` | - -These are related but they are not the same knob. 
Offline packed SFT and VLM -in-batch packing solve padding waste; long-context training primarily addresses -activation memory and communication tradeoffs at larger sequence lengths. - -## When to Use It - -Packed sequences are a good fit when all of the following are true: - -- you are doing SFT, PEFT, or VLM finetuning (all three packing paths are - supported; see the path table above) -- your examples have variable lengths and padding waste is significant -- you can tolerate the micro-batch constraints of packed training - -Packed sequences are usually not the right answer when: - -- you are doing standard Megatron-style pretraining, which already concatenates - documents during sampling -- you want long-context training in general, where context parallelism is often - the main technique -- your model family or recipe explicitly opts out of packed-sequence support - -## Stable Constraints - -The durable constraints for packed sequences in Bridge are: - -- packed SFT requires `micro_batch_size == 1` -- when context parallelism is used, sequence length must satisfy the standard - CP divisibility constraints -- for fine-tuning with CP enabled, per-token loss behavior and reduction - settings matter -- CUDA-graph-friendly packed metadata requires additional padding constraints - -Model-family support is not universal. Some families and recipe paths explicitly -opt out of packed sequences or related packing modes. 
- ## Relationship to Long-Sequence Training - -Packed sequences and long-sequence training are often mentioned together because -both affect sequence layout and memory behavior, but they solve different -problems: - -- packed sequences mainly reduce padding waste in fine-tuning datasets -- long-sequence training mainly addresses activation memory and communication - tradeoffs at larger sequence lengths - -For long-sequence training guidance, see: - -- `docs/performance-guide.md` -- `docs/training/hybrid-context-parallel.md` - -## Practical Caveats - -The most stable caveats to remember are: - -1. Packed-sequence support is recipe- and model-family-specific. -2. Fine-tuning sequence packing should not be assumed to work with every other - training feature. -3. Packed sequences improve efficiency primarily by reducing padding waste, not - by replacing long-context parallelism or memory-planning techniques. - -## Related Docs - -- [docs/training/multi-token-prediction.md](multi-token-prediction.md) -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md) -- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md) -- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml) -- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md) -- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotron3-super.md -```md -# Nemotron 3 Super -[Nemotron 3 Super](https://huggingface.co/collections/nvidia/nvidia-nemotron-v3) is a large language model (LLM) trained by NVIDIA, designed to deliver strong agentic, reasoning, and conversational capabilities.
It employs a hybrid **Latent Mixture-of-Experts (LatentMoE)** architecture, utilizing interleaved Mamba-2 and MoE layers, along with select Attention layers. Distinct from the Nano model, the Super model incorporates **Multi-Token Prediction (MTP)** layers for faster text generation and improved quality, and it is trained using **NVFP4** quantization to maximize compute efficiency. The model has **12B active parameters** and **120B parameters in total**. - -NeMo Megatron Bridge supports pretraining, full-parameter finetuning, and LoRA finetuning of this model. The finetuned model can be converted back to the 🤗 Hugging Face format for downstream evaluation. - -```{important} -Please use the custom container `nvcr.io/nvidia/nemo:26.02.nemotron_3_super` when working with this model. - -Run all commands from `/opt/Megatron-Bridge` (e.g. `docker run -w /opt/Megatron-Bridge ...`) -``` - -## Getting the Latest Code - -For the best experience, it is recommended to use the latest code from the `super-v3` branch. There are two ways to do this: - -### Option 1: Update the Code Inside the Container - -Launch the container and update the code in-place: - -```bash -# Pull the latest changes from the super-v3 branch -cd /opt/megatron -git pull origin super-v3 -``` - -### Option 2: Mount the Repo from Host - -This approach lets you work with the code on your host machine and mount it into the container at runtime.
- -**Step 1 — Pull the latest `super-v3` branch on the host:** - -```bash -git checkout super-v3 && git pull origin super-v3 -``` - -**Step 2 — Mount the repo when launching the container:** - -```bash -MEGATRON_BRIDGE_PATH=/path/to/Megatron-Bridge # set this to your local clone - -docker run --rm -it \ - -v $MEGATRON_BRIDGE_PATH:/opt/Megatron-Bridge \ - -w /opt/Megatron-Bridge \ - nvcr.io/nvidia/nemo:26.02.nemotron_3_super \ - bash -``` - ---- - -## Conversion with 🤗 Hugging Face - -### Import HF → Megatron -To import the HF model to your desired `$MEGATRON_MODEL_PATH`, use the distributed -conversion script because this model uses expert parallelism. The single-process -`examples/conversion/convert_checkpoints.py` script is limited to single-GPU conversion -without model parallelism. - -```bash -HF_MODEL=/path/to/hf/model -MEGATRON_PATH=/path/to/output/megatron/ckpt - -torchrun --nproc-per-node=8 examples/conversion/convert_checkpoints_multi_gpu.py import \ ---hf-model $HF_MODEL \ ---megatron-path $MEGATRON_PATH \ ---tp 1 \ ---ep 8 -``` - -Notes: -- The default parallelism is TP=1, EP=8 (Expert Parallel) -- Adjust `--nproc-per-node` based on your available GPUs - -### Export Megatron → HF -```bash -HF_MODEL=/path/to/hf/model -MEGATRON_PATH=/path/to/trained/megatron/ckpt -OUTPUT_PATH=/path/to/output/hf/ckpt - -torchrun --nproc-per-node=8 examples/conversion/convert_checkpoints_multi_gpu.py export \ ---hf-model $HF_MODEL \ ---megatron-path $MEGATRON_PATH \ ---hf-path $OUTPUT_PATH \ ---tp 1 \ ---ep 8 -``` - -### Roundtrip Testing -To verify the correctness of import/export conversions: - -```bash -HF_MODEL=/path/to/hf/model -MEGATRON_PATH=/path/to/megatron/ckpt - -torchrun --nproc-per-node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ ---hf-model-id $HF_MODEL \ ---megatron-load-path $MEGATRON_PATH \ ---tp 1 \ ---ep 8 \ ---trust-remote-code -``` - -### Compare HF and Megatron Outputs -To compare outputs between HF and Megatron models: - -```bash 
-HF_MODEL=/path/to/hf/model -MEGATRON_PATH=/path/to/megatron/ckpt - -torchrun --nproc-per-node=8 examples/conversion/compare_hf_and_megatron/compare.py \ ---hf_model_path $HF_MODEL \ ---megatron_model_path $MEGATRON_PATH \ ---prompt "Hello who are " \ ---tp 8 \ ---ep 8 \ ---trust_remote_code -``` - -## Pretraining Examples - -### Pretraining with Real Data -```bash -BLEND_PATH=/path/to/dataset/blend.json -CHECKPOINT_DIR=/path/to/checkpoints - -torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_super.py \ ---per-split-data-args-path=${BLEND_PATH} \ -logger.wandb_project=your_project \ -logger.wandb_entity=nvidia \ -logger.log_interval=5 \ -checkpoint.load=${CHECKPOINT_DIR} \ -checkpoint.save=${CHECKPOINT_DIR} \ -checkpoint.save_interval=100 \ -train.global_batch_size=8 \ -train.micro_batch_size=1 \ -train.train_iters=1280 \ -scheduler.lr_warmup_iters=128 \ -scheduler.lr_decay_iters=1152 \ -scheduler.lr_wsd_decay_iters=1152 \ -model.tensor_model_parallel_size=4 \ -model.context_parallel_size=1 \ -model.expert_model_parallel_size=64 \ -model.sequence_parallel=True -``` - -Notes: -- **GPU Requirements**: Requires B200 GPUs for NVFP4 support. 
Minimum of 8 nodes (64 GPUs) required -- The default parallelism settings are TP=4, EP=64, PP=1, CP=1 with sequence parallel enabled -- Expert parallelism (EP) is set to 64 for the MoE architecture -- Adjust batch sizes and iteration counts based on your training requirements -- Make sure to set up WandB credentials if using WandB logging - -### Pretraining with Mock Data -For quick testing without a dataset: - -```bash -CHECKPOINT_DIR=/path/to/checkpoints - -torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_super.py \ -logger.wandb_project=your_project \ -logger.wandb_entity=nvidia \ -checkpoint.load=${CHECKPOINT_DIR} \ -checkpoint.save=${CHECKPOINT_DIR} \ -checkpoint.save_interval=100 \ -train.global_batch_size=128 \ -train.train_iters=100 \ -scheduler.lr_warmup_iters=10 \ -model.hybrid_override_pattern="MEME*ME" \ -model.num_layers=7 -``` - -Notes: -- If `BLEND_PATH` is not specified, mock dataset will be used -- The `hybrid_override_pattern` can be used to customize the MoE layer pattern -- Useful for debugging and testing the training pipeline - - -## Finetuning Recipes - -### Full Parameter Fine-Tuning -```bash -MEGATRON_PATH=/path/to/pretrained/megatron/ckpt -CHECKPOINT_DIR=/path/to/finetuned/checkpoints - -torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_super.py \ -logger.wandb_project=your_project \ -logger.wandb_entity=nvidia \ -logger.log_interval=5 \ -checkpoint.load=${CHECKPOINT_DIR} \ -checkpoint.save=${CHECKPOINT_DIR} \ -checkpoint.save_interval=50 \ -train.global_batch_size=16 \ -train.train_iters=200 \ -scheduler.lr_warmup_iters=10 \ -model.tensor_model_parallel_size=4 \ -model.sequence_parallel=True \ -checkpoint.pretrained_checkpoint=$MEGATRON_PATH -``` - -Notes: -- Default parallelism TP=4, EP=8, PP=1, CP=1 with sequence parallel enabled -- By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used. 
-- Fine-tuning requires a pretrained Megatron checkpoint, which can be obtained from the "Import HF → Megatron" section above -- Adjust `global_batch_size` and parallelism settings based on your GPU memory and requirements - - -### LoRA Fine-Tuning -To enable LoRA fine-tuning, pass `--peft lora` to the script: - -```bash -MEGATRON_PATH=/path/to/pretrained/megatron/ckpt -CHECKPOINT_DIR=/path/to/lora/checkpoints - -torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_super.py \ ---peft lora \ -logger.wandb_project=your_project \ -logger.wandb_entity=nvidia \ -logger.log_interval=5 \ -checkpoint.load=${CHECKPOINT_DIR} \ -checkpoint.save=${CHECKPOINT_DIR} \ -checkpoint.save_interval=100 \ -train.global_batch_size=4 \ -train.train_iters=200 \ -model.tensor_model_parallel_size=4 \ -model.context_parallel_size=2 \ -model.sequence_parallel=True \ -scheduler.lr_warmup_iters=30 \ -checkpoint.pretrained_checkpoint=$MEGATRON_PATH -``` - -Notes: -- By default, the target modules are linear layers `["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]` in the model -- LoRA fine-tuning uses less memory and can work with smaller batch sizes -- Consider using Context Parallel (CP) for longer sequences - - -## Quantization (PTQ and QAT) - -```{important} -Quantization support requires the latest code from the `super-v3` branch. See [Getting the Latest Code](#getting-the-latest-code) for instructions. 
-``` - -Nemotron 3 Super supports four quantization configurations: - -| Config Name | Format | Description | -|---|---|---| -| `mamba_moe_fp8_aggressive` | FP8 | Aggressive FP8 quantization for Mamba-MoE | -| `mamba_moe_fp8_conservative` | FP8 | Conservative FP8 quantization for Mamba-MoE | -| `mamba_moe_nvfp4_aggressive` | NVFP4 | Aggressive NVFP4 quantization for Mamba-MoE | -| `mamba_moe_nvfp4_conservative` | NVFP4 | Conservative NVFP4 quantization for Mamba-MoE | - -Pass the desired config name via `--export-quant-cfg` to `quantize.py`. - -### Quantize -```bash -export HF_MODEL=/path/to/hf/model -export MEGATRON_SAVE_PATH=/path/to/quantized/megatron/ckpt - -torchrun --nproc_per_node=8 examples/quantization/quantize.py \ - --hf-model-id $HF_MODEL \ - --export-quant-cfg mamba_moe_nvfp4_conservative \ - --megatron-save-path $MEGATRON_SAVE_PATH \ - --pp 1 \ - --tp 8 \ - --ep 8 \ - --trust-remote-code -``` - -### Verify with PTQ Generate -```bash -torchrun --nproc_per_node=8 examples/quantization/ptq_generate.py \ - --hf-model-id $HF_MODEL \ - --megatron-load-path $MEGATRON_SAVE_PATH \ - --pp 1 \ - --tp 8 \ - --ep 8 \ - --trust-remote-code -``` - -Notes: -- For multi-node setups (e.g. 2 nodes with 8× H100), increase `--pp` accordingly (e.g. `--pp 2`) and use a job scheduler like SLURM to launch across nodes. - -### Export Quantized Megatron Checkpoint → HF - -After quantization, export the Megatron checkpoint back to Hugging Face format: - -```bash -HF_MODEL=/path/to/hf/model -MEGATRON_LOAD_PATH=/path/to/quantized/megatron/ckpt -EXPORT_DIR=/path/to/output/hf/ckpt - -torchrun --nproc_per_node=8 examples/quantization/export.py \ - --hf-model-id $HF_MODEL \ - --megatron-load-path $MEGATRON_LOAD_PATH \ - --export-dir $EXPORT_DIR \ - --pp 8 \ - --dtype bfloat16 \ - --trust-remote-code -``` - -### Quantization-Aware Training (QAT) - -After quantization, further improve model quality with QAT by continuing training from a quantized Megatron checkpoint. 
- -```bash -MEGATRON_PATH=/path/to/quantized/megatron/ckpt -CHECKPOINT_DIR=/path/to/qat/checkpoints - -torchrun --nproc-per-node=8 examples/models/nemotron_3/qat_nemotron_3_super.py \ ---megatron-load-path=${MEGATRON_PATH} \ ---seq-length=8192 \ ---packed-sequence \ -logger.wandb_project=your_project \ -logger.wandb_entity=nvidia \ -logger.log_interval=5 \ -checkpoint.load=${CHECKPOINT_DIR} \ -checkpoint.save=${CHECKPOINT_DIR} \ -checkpoint.save_interval=50 \ -train.global_batch_size=16 \ -train.train_iters=200 \ -scheduler.lr_warmup_iters=10 \ -model.tensor_model_parallel_size=4 \ -model.sequence_parallel=True -``` -``` - -File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/launch_with_sbatch.sh -```sh -#!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#SBATCH --job-name=megatron-bridge-train -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 -#SBATCH --time=04:00:00 -#SBATCH --partition=gpu -#SBATCH --account=my_account -#SBATCH --output=logs/train_%j.out -#SBATCH --error=logs/train_%j.err -#SBATCH --exclusive - -# ============================================================================== -# Direct Slurm Launch with sbatch (Alternative to NeMo-Run) -# -# This script demonstrates how to launch generic training scripts directly -# using sbatch without NeMo-Run. This is useful for traditional HPC workflows. -# -# Usage: -# 1. 
Modify the #SBATCH directives above for your cluster -# 2. Set the configuration variables below -# 3. Submit: sbatch launch_with_sbatch.sh -# -# For NeMo-Run based launching (recommended for remote management), see -# launch_with_nemo_run.py -# ============================================================================== - -# ============================================================================== -# CONFIGURATION - Modify these for your setup -# ============================================================================== - -# Training script to run -TRAINING_SCRIPT="run_recipe.py" -# Options: -# TRAINING_SCRIPT="run_recipe.py" -# TRAINING_SCRIPT="pretrain_vlm.py" # For VLM models -# TRAINING_SCRIPT="finetune_vlm.py" # For VLM finetuning - -# Recipe name (must match a recipe function from megatron.bridge.recipes) -RECIPE="llama32_1b_pretrain_config" -# Examples: -# RECIPE="gemma3_1b_pretrain_config" -# RECIPE="qwen3_8b_sft_config" -# RECIPE="llama3_8b_pretrain_config" -# RECIPE="qwen25_vl_pretrain_config" # For VLM models - -# Forward step type (gpt or vlm) -STEP_TYPE="gpt" - -# Optional: CLI overrides (Hydra-style dot notation) -CLI_OVERRIDES="" -# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512 optimizer.lr=0.0002" - -# Container image (required) -CONTAINER_IMAGE="" -# CONTAINER_IMAGE="/path/to/container.sqsh" - -# Container mounts (optional, space-separated) -CONTAINER_MOUNTS="" -# CONTAINER_MOUNTS="/data:/data /model:/model" - -# ============================================================================== -# Environment Setup -# ============================================================================== - -# Set common environment variables -export TORCH_NCCL_AVOID_RECORD_STREAMS=1 -export NCCL_NVLS_ENABLE=0 - -# Authentication tokens (uncomment and set your tokens) -# export HF_TOKEN="hf_your_token_here" -# export WANDB_API_KEY="your_wandb_key_here" - -# Optional: Uncomment if needed -# export CUDA_DEVICE_MAX_CONNECTIONS=1 -# 
export NCCL_DEBUG=INFO - -# ============================================================================== -# Job Execution -# ============================================================================== - -echo "======================================" -echo "Megatron Bridge Training Job" -echo "======================================" -echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_JOB_NUM_NODES" -echo "GPUs per node: $SLURM_GPUS_PER_NODE" -echo "Script: $TRAINING_SCRIPT" -echo "Recipe: $RECIPE" -if [ -n "$HF_TOKEN" ]; then - echo "HF_TOKEN: Set" -fi -if [ -n "$WANDB_API_KEY" ]; then - echo "WANDB_API_KEY: Set" -fi -echo "======================================" - -# Determine script path -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}" - -if [ ! -f "$SCRIPT_PATH" ]; then - echo "ERROR: Training script not found: $SCRIPT_PATH" - exit 1 -fi - -# Build torchrun command -CMD="torchrun" -CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE" -CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES" -CMD="$CMD --node_rank=\$SLURM_PROCID" -CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)" -CMD="$CMD --master_port=29500" -CMD="$CMD $SCRIPT_PATH" -CMD="$CMD --recipe $RECIPE" -CMD="$CMD --step $STEP_TYPE" - -# Add CLI overrides if specified -if [ -n "$CLI_OVERRIDES" ]; then - CMD="$CMD $CLI_OVERRIDES" -fi - -echo "Executing: $CMD" -echo "======================================" - -# Require container image -if [ -z "$CONTAINER_IMAGE" ]; then - echo "ERROR: CONTAINER_IMAGE must be set. Please use a valid container image." 
- exit 1 -fi - -# Build srun command (always containerized) -SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" - -# Add container mounts -if [ -n "$CONTAINER_MOUNTS" ]; then - for mount in $CONTAINER_MOUNTS; do - SRUN_CMD="$SRUN_CMD --container-mounts=$mount" - done -fi - -$SRUN_CMD bash -c "$CMD" - -echo "======================================" -echo "Job completed" -echo "======================================" - - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/canonical_lora.png -```png -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/optimizer-scheduler.md -```md -# Optimizer and Scheduler Configuration - -The optimizer and scheduler configurations control optimization algorithms, learning rate schedules, and weight decay strategies. - -## OptimizerConfig (from Megatron Core) - -The `OptimizerConfig` contains all parameters for the optimization algorithm and comes directly from Megatron Core. Key parameters include: - -| Parameter | Type | Description | -|-----------|------|-------------| -| `optimizer` | `str` | Optimizer type ("adam", "sgd", etc.) 
| -| `lr` | `float` | Base learning rate | -| `min_lr` | `float` | Minimum learning rate for decay schedules | -| `weight_decay` | `float` | L2 regularization coefficient | -| `adam_beta1` | `float` | Adam optimizer beta1 parameter | -| `adam_beta2` | `float` | Adam optimizer beta2 parameter | -| `adam_eps` | `float` | Adam optimizer epsilon parameter | -| `clip_grad` | `float` | Gradient clipping threshold | -| `use_distributed_optimizer` | `bool` | Enable distributed optimizer for memory efficiency | -| `overlap_grad_reduce` | `bool` | Overlap gradient reduction with computation | -| `overlap_param_gather` | `bool` | Overlap parameter gathering with computation | -| `bf16` | `bool` | Use BF16 precision for training | -| `fp16` | `bool` | Use FP16 precision for training | - -## SchedulerConfig - -The `SchedulerConfig` controls learning rate scheduling and weight decay progression throughout training. - -### Learning Rate Scheduling - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `lr_decay_style` | `Literal["constant", "linear", "cosine", "inverse-square-root", "WSD"]` | `"linear"` | Learning rate decay function | -| `lr_decay_iters` | `Optional[int]` | `None` | Iterations to decay LR over (defaults to `train_iters`). Use for iteration-based training. | -| `lr_decay_samples` | `Optional[int]` | `None` | Samples to decay LR over (defaults to `train_samples`). Use for sample-based training. | -| `lr_warmup_iters` | `int` | `0` | Iterations to linearly warmup learning rate. Use for iteration-based training. | -| `lr_warmup_samples` | `int` | `0` | Samples to linearly warmup learning rate. Use for sample-based training. 
| -| `lr_warmup_fraction` | `Optional[float]` | `None` | Fraction of decay iterations/samples to use for warmup (works with both modes) | -| `lr_warmup_init` | `float` | `0.0` | Initial learning rate for warmup phase | - -**Scheduler Mode Selection** - -The scheduler supports two modes that must align with your training configuration: - -1. **Iteration-based scheduling**: Use `lr_decay_iters` and `lr_warmup_iters` with `train_iters`. -2. **Sample-based scheduling**: Use `lr_decay_samples` and `lr_warmup_samples` with `train_samples`. - -**Important constraints** -- Cannot mix iteration-based and sample-based scheduler parameters. -- Your scheduler mode must match your training mode (iteration-based vs sample-based). -- `lr_warmup_fraction` is compatible with both modes but cannot be used with explicit warmup iterations/samples. - -### WSD (Warmup-Stable-Decay) Scheduling - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `lr_wsd_decay_style` | `Literal["exponential", "linear", "cosine"]` | `"exponential"` | Decay style for WSD annealing phase | -| `lr_wsd_decay_iters` | `Optional[int]` | `None` | Iterations for WSD annealing phase. Use for iteration-based training. | -| `lr_wsd_decay_samples` | `Optional[int]` | `None` | Samples for WSD annealing phase. Use for sample-based training. 
| - -### Weight Decay Scheduling - -Parameters for controlling the progression of weight decay during training, including start and end values and the scheduling strategy: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `start_weight_decay` | `Optional[float]` | `None` | Initial weight decay coefficient | -| `end_weight_decay` | `Optional[float]` | `None` | Final weight decay coefficient | -| `weight_decay_incr_style` | `Literal["constant", "linear", "cosine"]` | `"constant"` | Weight decay progression style | - -### Checkpoint Integration - -Parameters for managing how scheduler settings are applied during checkpoint loading, allowing control over whether to prioritize config values or restore from saved state: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `override_opt_param_scheduler` | `bool` | `False` | Reset scheduler values from config, ignoring checkpoint | -| `use_checkpoint_opt_param_scheduler` | `bool` | `False` | Use scheduler values from checkpoint, ignoring config | - -### Computed Fields - -These fields are automatically calculated during configuration validation and help align training schedules with the configured batch size and iteration counts: - -| Field | Description | -|-------|-------------| -| `lr_warmup_steps` | Total steps for warmup (calculated from iterations and batch size) | -| `lr_decay_steps` | Total steps for decay (calculated from iterations and batch size) | -| `wd_incr_steps` | Total steps for weight decay progression | -| `wsd_decay_steps` | Total steps for WSD annealing phase | - -## Learning Rate Schedules - -The following scheduling strategies define how the learning rate evolves during training, each suited to different convergence behaviors and model types: -| Schedule Type | Description | -|-------------------------|-----------------------------------------------------------------------------| -| **Constant** | Learning rate 
remains fixed throughout training. | -| **Linear** | Learning rate decreases linearly from the base LR to the minimum LR. | -| **Cosine** | Learning rate follows a cosine decay curve from base LR to minimum LR. | -| **Inverse Square Root** | Learning rate decays proportionally to the inverse square root of the step. | - -## WSD (Warmup-Stable-Decay) -The WSD schedule divides learning rate progression into three distinct phases, offering fine-grained control over early ramp-up, mid-training stability, and final decay: -| Phase | Description | -|-----------|----------------------------------------------------------| -| **Warmup** | Learning rate increases linearly from initial value to base LR. | -| **Stable** | Learning rate remains constant at base LR. | -| **Decay** | Learning rate decays to minimum LR using a specified style (e.g., exponential, linear, cosine). | - -## Weight Decay Scheduling - -These scheduling options control how the weight decay coefficient changes over time, allowing for regularization strategies that adapt to different training phases: -| Schedule Type | Description | -|---------------|-----------------------------------------------------------------------------| -| **Constant** | Fixed weight decay throughout training. | -| **Linear** | Linear progression from start to end weight decay. | -| **Cosine** | Cosine progression from start to end weight decay. | -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/nano/slurm_pretrain.sh -```sh -#!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ============================================================================== -# Nemotron 3 Nano Pretraining -# -# Nemotron 3 Nano is a 30B parameter model with A3B (Active 3 Billion) architecture -# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially. -# -# Usage: -# 1. Modify the #SBATCH directives below for your cluster -# 2. Set CONTAINER_IMAGE to your container path -# 3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled) -# 4. Submit: sbatch slurm_pretrain.sh -# ============================================================================== - -#SBATCH --job-name=nemotron3-pretrain -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 -#SBATCH --time=24:00:00 -#SBATCH --partition=gpu -#SBATCH --account=my_account -#SBATCH --output=logs/nemotron3_pretrain_%j.out -#SBATCH --error=logs/nemotron3_pretrain_%j.err -#SBATCH --exclusive - -# ============================================================================== -# CONFIGURATION -# ============================================================================== - -# Workspace directory for checkpoints and results -WORKSPACE=${WORKSPACE:-/workspace} - -# Model and training configurations -MODEL_NAME=nemotron_3_nano -DATASET_NAME=mock -SEQ_LENGTH=512 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=32 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR_WARMUP_ITERS=5 -LOG_INTERVAL=1 -WANDB_PROJECT=megatron-bridge-${DATASET_NAME} - -# Parallelism configs: "TP,PP,EP,CP,SP" per entry -PARALLELISM_CONFIGS=("4,1,8,1,True" "2,2,8,1,True" 
"2,1,8,2,True") - -# Container image (required) -CONTAINER_IMAGE="" -# CONTAINER_IMAGE="/path/to/container.sqsh" - -# Container mounts (optional, space-separated) -CONTAINER_MOUNTS="" -# CONTAINER_MOUNTS="/data:/data /workspace:/workspace" - -# ============================================================================== -# Environment Setup -# ============================================================================== - -# NCCL optimizations for large-scale training -export TORCH_NCCL_AVOID_RECORD_STREAMS=1 -export NCCL_NVLS_ENABLE=0 - -# UV cache on shared filesystem (recommended for multi-node setups) -# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync -# export UV_CACHE_DIR="/path/to/shared/uv_cache" - -# HuggingFace cache directory (recommended for shared filesystem) -# export HF_HOME="/path/to/shared/HF_HOME" - -# Authentication tokens (set these for your environment) -# export HF_TOKEN= -# export WANDB_API_KEY= - -# ============================================================================== -# Job Execution -# ============================================================================== - -echo "======================================" -echo "Nemotron 3 Nano Pretraining Job" -echo "======================================" -echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_JOB_NUM_NODES" -echo "GPUs per node: $SLURM_GPUS_PER_NODE" -echo "Model: $MODEL_NAME" -echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}" -echo "======================================" - -# Create logs directory if it doesn't exist -mkdir -p logs - -# Require container image -if [ -z "$CONTAINER_IMAGE" ]; then - echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image." 
- exit 1 -fi - -# Build srun command (shared across configs) -SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" -if [ -n "$CONTAINER_MOUNTS" ]; then - SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS" -fi -echo "SRUN base: $SRUN_CMD" -echo "======================================" - -# Run each parallelism config in sequence -CONFIG_INDEX=0 -for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do - IFS=',' read -r TP PP EP CP SP <<< "$CONFIG" - CONFIG_INDEX=$((CONFIG_INDEX + 1)) - echo "" - echo "======================================" - echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP" - echo "======================================" - - # Build CLI overrides for this config - CLI_OVERRIDES="\ - model.seq_length=$SEQ_LENGTH \ - train.train_iters=$TRAIN_ITERS \ - train.global_batch_size=$GLOBAL_BATCH_SIZE \ - train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ - scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - logger.log_interval=$LOG_INTERVAL \ - logger.wandb_project=$WANDB_PROJECT \ - logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - dataset.sequence_length=$SEQ_LENGTH \ - model.tensor_model_parallel_size=$TP \ - model.pipeline_model_parallel_size=$PP \ - model.expert_model_parallel_size=$EP \ - model.sequence_parallel=$SP \ - model.context_parallel_size=$CP" - - CMD="uv run --no-sync python scripts/training/run_recipe.py" - CMD="$CMD --recipe ${MODEL_NAME}_pretrain_config" - CMD="$CMD $CLI_OVERRIDES" - - echo "Executing command..." - echo $CMD - echo "======================================" - - $SRUN_CMD bash -c "$CMD" - RUN_EXIT=$? 
- if [ $RUN_EXIT -ne 0 ]; then - echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT" - exit $RUN_EXIT - fi -done - -echo "======================================" -echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)" -echo "======================================" - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/activation-recomputation-example-2.jpg -```jpg -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/gpt_oss/README.md -```md -# GPT-OSS Examples - -This directory contains example scripts for GPT-OSS 20B language models. - -For model introduction and architecture details, see the GPT-OSS documentation. - -## Workspace Configuration - -All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it: - -```bash -export WORKSPACE=/your/custom/path -``` - -Directory structure: -- `${WORKSPACE}/models/` - Converted checkpoints -- `${WORKSPACE}/results/` - Training outputs and experiment results - -## Checkpoint Conversion - -See the [conversion.sh](conversion.sh) script for checkpoint conversion examples. - -- **Import**: Use `openai/gpt-oss-20b` as the source Hugging Face model. -- **Export**: Use `unsloth/gpt-oss-20b-BF16` as the reference HF model for export because the exported Megatron checkpoint is unquantized (bf16), which matches that repo's format. 
- -### Import HF → Megatron - -To import the HF model to your desired Megatron path: - -```bash -python examples/conversion/convert_checkpoints.py import \ - --hf-model openai/gpt-oss-20b \ - --megatron-path ${WORKSPACE}/models/gpt-oss-20b \ - --trust-remote-code -``` - -### Export Megatron → HF - -The export uses `unsloth/gpt-oss-20b-BF16` as the reference so the saved HF checkpoint matches that unquantized format: - -```bash -python examples/conversion/convert_checkpoints.py export \ - --hf-model unsloth/gpt-oss-20b-BF16 \ - --megatron-path ${WORKSPACE}/models/gpt-oss-20b/iter_0000000 \ - --hf-path ${WORKSPACE}/models/gpt-oss-20b-hf-export -``` - -### Round-trip Validation - -Multi-GPU round-trip validation between formats: - -```bash -python -m torch.distributed.run --nproc_per_node=8 \ - examples/conversion/hf_megatron_roundtrip_multi_gpu.py \ - --hf-model-id unsloth/gpt-oss-20b-BF16 \ - --megatron-load-path ${WORKSPACE}/models/gpt-oss-20b/iter_0000000 \ - --tp 2 --pp 2 \ - --trust-remote-code -``` - -## Training Recipes - -- See: [bridge.recipes.gpt_oss](../../../src/megatron/bridge/recipes/gpt_oss/gpt_oss.py) -- Available recipes: - - `gpt_oss_20b_pretrain_config`: Pretraining configuration for 20B - - `gpt_oss_20b_pretrain_fp8_current_scaling_config`: Pretraining configuration for 20B with Hopper FP8 current scaling - - `gpt_oss_20b_sft_config`: Full SFT configuration for 20B - - `gpt_oss_20b_sft_fp8_current_scaling_config`: Full SFT configuration for 20B with Hopper FP8 current scaling - - `gpt_oss_20b_peft_config`: LoRA PEFT configuration for 20B - - `gpt_oss_20b_peft_fp8_current_scaling_config`: LoRA PEFT configuration for 20B with Hopper FP8 current scaling - - `gpt_oss_20b_pretrain_mxfp8_config`: Pretraining configuration for 20B with Blackwell MXFP8 - - `gpt_oss_20b_sft_mxfp8_config`: Full SFT configuration for 20B with Blackwell MXFP8 - - `gpt_oss_20b_peft_mxfp8_config`: LoRA PEFT configuration for 20B with Blackwell MXFP8 - - 
`gpt_oss_120b_pretrain_config`: Pretraining configuration for 120B - - `gpt_oss_120b_sft_config`: Full SFT configuration for 120B - - `gpt_oss_120b_peft_config`: LoRA PEFT configuration for 120B - -Before training, ensure the following are configured: -1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path -2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories -3. **Environment Variables**: - - `HF_TOKEN`: to download models from HF Hub (if required) - - `HF_HOME`: (optional) to avoid re-downloading models and datasets - - `WANDB_API_KEY`: (optional) to enable WandB logging - -All training scripts use SLURM for containerized multi-node training. - -### FP8 Training (Hopper GPUs) - -The FP8 current scaling recipes enable mixed-precision training with FP8 on Hopper GPUs. To use an FP8 recipe, uncomment the FP8 `RECIPE_NAME` line in the corresponding SLURM script: - -- [slurm_pretrain.sh](slurm_pretrain.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_pretrain_fp8_current_scaling_config"` -- [slurm_sft.sh](slurm_sft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_sft_fp8_current_scaling_config"` -- [slurm_peft.sh](slurm_peft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_peft_fp8_current_scaling_config"` - -### MXFP8 Training (Blackwell GPUs) - -MXFP8 (`bf16_with_mxfp8_mixed`) enables mixed-precision training on Blackwell GPUs. To use an MXFP8 recipe, uncomment the MXFP8 `RECIPE_NAME` line in the corresponding SLURM script: - -- [slurm_pretrain.sh](slurm_pretrain.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_pretrain_mxfp8_config"` -- [slurm_sft.sh](slurm_sft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_sft_mxfp8_config"` -- [slurm_peft.sh](slurm_peft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_peft_mxfp8_config"` - -> **Note**: For GB200 nodes (4 GPUs/node), also update `--gpus-per-node` and `--ntasks-per-node` to 4 in the SBATCH directives. 
- -### Pretrain - -Pretrain uses the **DCLM** dataset by default when `DCLM_DATA_DIR` and `DCLM_CACHE` are set (see [slurm_pretrain.sh](slurm_pretrain.sh)). A single random DCLM shard was used for testing. - -To use your own preprocessed DCLM data, set the dataset config as follows (e.g. in the recipe or via overrides): - -```python -cfg.dataset.blend = [ - [f"/path/to/dclm/preprocessed/dclm_{i:02d}_text_document" for i in range(1, 11)], - None, -] -cfg.dataset.split = "9999,8,2" -cfg.dataset.path_to_cache = "/path/to/cache" -``` - -Preprocess your data using the [DCLM data preprocessing tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/data/dclm). - -### Supervised Fine-Tuning (SFT) - -See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning. The recipe uses sequence packing by default. - -### Parameter-Efficient Fine-Tuning (PEFT) with LoRA - -See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning. The recipe uses sequence packing by default. - -### Expected Training Dynamics -We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/xs3rmk4t) for the expected loss curves and grad norms. - -## Inference - -See [inference.sh](inference.sh) for text generation with: -- Hugging Face checkpoint (`unsloth/gpt-oss-20b-BF16`) -- Imported Megatron checkpoint (after [conversion.sh](conversion.sh) import) -- Exported HF checkpoint (after conversion export) -- **SFT (finetuned) checkpoint**: set `SFT_CHECKPOINT` to your [slurm_sft.sh](slurm_sft.sh) result dir and run: - -```bash -uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_text.py \ - --hf_model_path unsloth/gpt-oss-20b-BF16 \ - --megatron_model_path ${WORKSPACE}/results/gpt_oss_20b_finetune_tp2_pp2_ep4_spTrue_cp1 \ - --prompt "Hello, how are you?" \ - --max_new_tokens 64 \ - --tp 2 --pp 2 --ep 2 --etp 1 \ - --trust-remote-code -``` - -TP×PP×EP must equal `--nproc_per_node`. 
Adjust parallelism to match your SFT run. - -## Evaluation - -Coming soon. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/utils/omegaconf_utils.py -```py -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utilities for working with OmegaConf and dataclass configurations.""" - -import dataclasses -import functools -import inspect -import logging -from enum import Enum -from pathlib import Path -from typing import Any, Dict, List, Tuple, TypeVar - -import torch -from hydra._internal.config_loader_impl import ConfigLoaderImpl -from hydra.core.override_parser.overrides_parser import OverridesParser -from omegaconf import DictConfig, OmegaConf - -# Re-export so existing callers (e.g. transformer_config.py) keep working. 
-from megatron.bridge.utils.activation_map import callable_to_str, str_to_callable # noqa: F401 - - -logger = logging.getLogger(__name__) - -DataclassInstance = TypeVar("DataclassInstance") - -# Sentinel object to distinguish between "exclude this field" and "field is legitimately None" -_EXCLUDE_FIELD = object() - -# Fields whose callables should be serialized as strings (not excluded) -_SERIALIZABLE_CALLABLE_FIELDS: frozenset[str] = frozenset({"activation_func"}) - - -def create_omegaconf_dict_config(config_container: Any) -> Tuple[DictConfig, Dict[str, Any]]: - """Create OmegaConf while tracking excluded fields for later restoration. - - This function combines the conversion to OmegaConf with tracking of excluded - callable fields, allowing them to be restored after override processing. - - Args: - config_container: The dataclass instance to convert - - Returns: - Tuple of (OmegaConf DictConfig, excluded fields dictionary) - - Raises: - ValueError: If the conversion fails - """ - logger.debug("Starting safe OmegaConf conversion with callable preservation...") - - # Track all callable fields that will be excluded - excluded_callables = _track_excluded_fields(config_container, "root") - logger.debug(f"Found {len(excluded_callables)} callable fields to preserve") - - # Convert to OmegaConf (excluding callables) - base_dict = _dataclass_to_omegaconf_dict(config_container, "root") - - if base_dict is _EXCLUDE_FIELD: - raise ValueError("Root configuration object was excluded (likely a callable)") - - # Verify no callables remain - if not _verify_no_callables(base_dict, "root"): - raise ValueError("Callable objects found in converted dictionary") - - # Create OmegaConf - omega_conf = OmegaConf.create(base_dict) - - return omega_conf, excluded_callables - - -def apply_overrides( - config_obj: DataclassInstance, overrides_dict: Dict[str, Any], excluded_fields: Dict[str, Any] -) -> None: - """Apply overrides while preserving excluded callable fields. 
- - This function first applies the overrides using the standard recursive approach, - then restores the callable fields that were excluded during OmegaConf conversion. - - Args: - config_obj: The dataclass instance to modify - overrides_dict: Dictionary of override values to apply - excluded_fields: Dictionary of excluded callable fields to restore - """ - # Apply normal overrides - _apply_overrides(config_obj, overrides_dict) - - # Restore excluded fields - _restore_excluded_fields(config_obj, excluded_fields) - - logger.debug("Configuration updated with overrides and excluded fields preserved") - - -def process_config_with_overrides( - config: DataclassInstance, - config_filepath: str | None = None, - cli_overrides: list[str] | None = None, -) -> DataclassInstance: - """Process a configuration object with optional YAML file and CLI overrides. - - This function provides a unified way to: - 1. Convert the config to OmegaConf while preserving callable fields - 2. Merge an optional YAML configuration file - 3. Apply optional CLI overrides using Hydra syntax - 4. Apply the final configuration back to the original object - - Args: - config: The dataclass configuration instance to process - config_filepath: Optional path to a YAML config file to merge - cli_overrides: Optional list of Hydra-style CLI override strings - - Returns: - The modified configuration object with all overrides applied - - Raises: - FileNotFoundError: If the specified config_filepath does not exist - OverridesError: If there's an error parsing CLI overrides - - Example: - >>> config = load_recipe("llama3_8b") - >>> config = process_config_with_overrides( - ... config, - ... config_filepath="my_config.yaml", - ... cli_overrides=["model_config.hidden_size=4096", "training_config.lr=1e-4"] - ... 
) - """ - # Convert config to OmegaConf, tracking excluded callable fields - omega_conf, excluded_fields = create_omegaconf_dict_config(config) - - # Merge YAML config file if provided - if config_filepath: - config_filepath = Path(config_filepath) - if not config_filepath.exists(): - raise FileNotFoundError(f"Config file not found: {config_filepath}") - - yaml_conf = OmegaConf.load(config_filepath) - omega_conf = OmegaConf.merge(omega_conf, yaml_conf) - logger.debug(f"Merged configuration from {config_filepath}") - - # Apply CLI overrides if provided - if cli_overrides: - omega_conf = parse_hydra_overrides(omega_conf, cli_overrides) - logger.debug(f"Applied {len(cli_overrides)} CLI overrides") - - # Convert back to dict and apply to original config object - final_config_dict = OmegaConf.to_container(omega_conf, resolve=True) - apply_overrides(config, final_config_dict, excluded_fields) - - return config - - -def parse_hydra_overrides(cfg: DictConfig, overrides: List[str]) -> DictConfig: - """Parse and apply Hydra overrides to an OmegaConf config. - - This function uses Hydra's override parser to support advanced override syntax - including additions (+), deletions (~), and complex nested operations. 
- - Args: - cfg: OmegaConf config to apply overrides to - overrides: List of Hydra override strings - - Returns: - Updated config with overrides applied - - Raises: - OverridesError: If there's an error parsing or applying overrides - """ - try: - OmegaConf.set_struct(cfg, True) - parser = OverridesParser.create() - parsed = parser.parse_overrides(overrides=overrides) - ConfigLoaderImpl._apply_overrides_to_config(overrides=parsed, cfg=cfg) - return cfg - except Exception as e: - raise OverridesError(f"Failed to parse Hydra overrides: {str(e)}") from e - - -class OverridesError(Exception): - """Custom exception for Hydra override parsing errors.""" - - pass - - -def _is_omegaconf_problematic(val: Any) -> bool: - """Check if a value is a callable that OmegaConf cannot handle. - - OmegaConf cannot serialize function objects, methods, or partial functions. - This function identifies such problematic callables while allowing class types. - - Args: - val: The value to check - - Returns: - True if the value is a problematic callable, False otherwise - """ - if val is None: - return False - - # Allow classes/types - if isinstance(val, type): - return False - - # Block function objects, methods, partial functions, etc. - if callable(val) or ( - hasattr(val, "__call__") - and (hasattr(val, "__module__") or hasattr(val, "__qualname__") or isinstance(val, functools.partial)) - ): - return True - - # Block arbitrary objects that are not dataclasses or safe primitives - if not isinstance( - val, (int, float, bool, str, list, tuple, dict, Path, Enum, torch.dtype) - ) and not dataclasses.is_dataclass(val): - return True - - return False - - -def _dataclass_to_omegaconf_dict(val_to_convert: Any, path: str = "") -> Any: - """Recursively convert a dataclass instance to a dictionary suitable for OmegaConf.create. - - This function completely excludes problematic callable objects to prevent OmegaConf errors. 
- It handles dataclasses, lists, tuples, dictionaries, and primitive types, while converting - torch.dtype objects to strings for serialization. - - Args: - val_to_convert: The value to convert - path: Current path for debugging (e.g., "model_config.activation_func") - - Returns: - Converted value suitable for OmegaConf, or _EXCLUDE_FIELD for excluded callables - """ - current_path = path - - # Handle Hugging Face GenerationConfig / PretrainedConfig by converting to a callable dict - # compatible with our YAML representer logic - try: - from transformers import GenerationConfig, PretrainedConfig # type: ignore - - if isinstance(val_to_convert, (GenerationConfig, PretrainedConfig)): - cfg_class = val_to_convert.__class__ - target = f"{inspect.getmodule(cfg_class).__name__}.{cfg_class.__qualname__}.from_dict" - logger.debug(f"Converting {cfg_class.__qualname__} at {current_path} to callable dict") - return { - "_target_": target, - "_call_": True, - "config_dict": val_to_convert.to_dict(), - } - except ModuleNotFoundError: - # transformers is optional; if unavailable, fall through to other handlers - pass - - # Explicitly handle torch.dtype - convert to string - if isinstance(val_to_convert, torch.dtype): - logger.debug(f"Converting torch.dtype at {current_path}: {val_to_convert}") - return str(val_to_convert) - - # Handle callables — serialize known activation functions as strings, - # exclude everything else. - if _is_omegaconf_problematic(val_to_convert): - field_name = current_path.rsplit(".", 1)[-1] if "." 
in current_path else current_path - if field_name in _SERIALIZABLE_CALLABLE_FIELDS: - str_name = callable_to_str(val_to_convert) - if str_name is not None: - logger.debug(f"Serializing callable at {current_path} as string: {str_name}") - return str_name - logger.debug(f"Excluding callable at {current_path}: {type(val_to_convert)} - {val_to_convert}") - return _EXCLUDE_FIELD - - # Handle dataclasses - elif dataclasses.is_dataclass(val_to_convert) and not isinstance(val_to_convert, type): - res = {} - for field in dataclasses.fields(val_to_convert): - field_name = field.name - field_path = f"{current_path}.{field_name}" if current_path else field_name - - try: - field_value = getattr(val_to_convert, field_name) - converted_value = _dataclass_to_omegaconf_dict(field_value, field_path) - - # Only exclude fields marked with sentinel (not legitimate None values) - if converted_value is not _EXCLUDE_FIELD: - res[field_name] = converted_value - else: - logger.debug(f"Excluded field {field_path}") - - except (AttributeError, TypeError) as e: - # Only catch specific exceptions from field access - logger.warning(f"Error processing field {field_path}: {e}") - continue - - return res - - # Handle lists - elif isinstance(val_to_convert, list): - result = [] - for i, item in enumerate(val_to_convert): - item_path = f"{current_path}[{i}]" - converted_item = _dataclass_to_omegaconf_dict(item, item_path) - - # Only exclude items marked with sentinel (not legitimate None values) - if converted_item is not _EXCLUDE_FIELD: - result.append(converted_item) - - return result - - # Handle tuples - elif isinstance(val_to_convert, tuple): - converted_items = [] - for i, item in enumerate(val_to_convert): - item_path = f"{current_path}[{i}]" - converted_item = _dataclass_to_omegaconf_dict(item, item_path) - - # Only exclude items marked with sentinel (not legitimate None values) - if converted_item is not _EXCLUDE_FIELD: - converted_items.append(converted_item) - - return 
tuple(converted_items) - - # Handle dictionaries - elif isinstance(val_to_convert, dict): - result = {} - for key, value in val_to_convert.items(): - key_path = f"{current_path}.{key}" if current_path else str(key) - converted_value = _dataclass_to_omegaconf_dict(value, key_path) - - # Only exclude values marked with sentinel (not legitimate None values) - if converted_value is not _EXCLUDE_FIELD: - result[key] = converted_value - - return result - - # Return primitive types as-is (including legitimate None values) - else: - return val_to_convert - - -def _track_excluded_fields(obj: Any, path: str = "") -> Dict[str, Any]: - """Track all excluded callable fields and their original values. - - This function recursively traverses a dataclass structure and builds a mapping - of field paths to their original callable values that will be excluded during - OmegaConf conversion. - - Args: - obj: The object to analyze for callable fields - path: Current path prefix for building field paths - - Returns: - Dictionary mapping field paths to their original callable values - """ - excluded_fields = {} - - if dataclasses.is_dataclass(obj) and not isinstance(obj, type): - for field in dataclasses.fields(obj): - field_name = field.name - field_path = f"{path}.{field_name}" if path else field_name - field_value = getattr(obj, field_name) - - if _is_omegaconf_problematic(field_value): - # Skip fields that are serialized as strings (not excluded) - if field_name in _SERIALIZABLE_CALLABLE_FIELDS and callable_to_str(field_value) is not None: - logger.debug(f"Skipping serializable callable (not excluded): {field_path}") - else: - excluded_fields[field_path] = field_value - logger.debug(f"Tracking excluded callable: {field_path}") - elif dataclasses.is_dataclass(field_value): - nested_excluded = _track_excluded_fields(field_value, field_path) - excluded_fields.update(nested_excluded) - elif isinstance(field_value, dict): - for key, value in field_value.items(): - if 
_is_omegaconf_problematic(value): - excluded_fields[f"{field_path}.{key}"] = value - - return excluded_fields - - -def _restore_excluded_fields(config_obj: Any, excluded_fields: Dict[str, Any]) -> None: - """Restore excluded callable fields to their original values. - - After applying overrides from OmegaConf, this function restores the callable - fields that were excluded during the conversion process. - - Args: - config_obj: The configuration object to restore fields on - excluded_fields: Dictionary mapping field paths to their original values - """ - for field_path, original_value in excluded_fields.items(): - try: - # Navigate to the parent object and field name - path_parts = field_path.split(".") - if path_parts[0] == "root": - path_parts = path_parts[1:] # Remove "root" prefix - - current_obj = config_obj - - # Navigate to the parent object - for part in path_parts[:-1]: - current_obj = getattr(current_obj, part) - - field_name = path_parts[-1] - - # Restore the original callable - setattr(current_obj, field_name, original_value) - logger.debug(f"Restored callable field: {field_path}") - - except (AttributeError, TypeError) as e: - logger.warning(f"Failed to restore callable field {field_path}: {e}") - - -def _verify_no_callables(obj: Any, path: str = "") -> bool: - """Recursively verify that no callable objects remain in the converted structure. - - This function is used for validation to ensure that all problematic callables - have been successfully excluded from a data structure before OmegaConf conversion. 
- - Args: - obj: The object to verify - path: Current path for error reporting - - Returns: - True if no problematic callables are found, False otherwise - """ - if _is_omegaconf_problematic(obj): - logger.error(f"Found problematic callable at {path}: {obj}") - return False - - elif isinstance(obj, dict): - for key, value in obj.items(): - key_path = f"{path}.{key}" if path else str(key) - if not _verify_no_callables(value, key_path): - return False - - elif isinstance(obj, (list, tuple)): - for i, item in enumerate(obj): - item_path = f"{path}[{i}]" - if not _verify_no_callables(item, item_path): - return False - - return True - - -def _apply_overrides(config_obj: DataclassInstance, overrides_dict: Dict[str, Any]) -> None: - """Recursively apply overrides from a Python dictionary to a dataclass instance. - - This function traverses nested dataclass structures and applies override values - from a dictionary. It handles type conversions for special cases like torch.dtype. - It also handles dictionaries with _target_ fields by instantiating them properly. - - Args: - config_obj: The dataclass instance to modify - overrides_dict: Dictionary of override values to apply - """ - if not dataclasses.is_dataclass(config_obj): - logger.debug(f"Skipping apply_overrides for non-dataclass config_obj: {type(config_obj)}") - return - - for key, value in overrides_dict.items(): - if not hasattr(config_obj, key): - logger.warning( - f"Key '{key}' in overrides not found in config object {type(config_obj).__name__}. Skipping." 
- ) - continue - - current_attr = getattr(config_obj, key) - - # Handle dictionaries with _target_ fields - if isinstance(value, dict) and "_target_" in value: - try: - from megatron.bridge.utils.instantiate_utils import instantiate - - instantiated_obj = instantiate(value) - setattr(config_obj, key, instantiated_obj) - logger.debug(f"Successfully instantiated {key} from _target_: {value['_target_']}") - continue - except Exception as e: - logger.warning(f"Failed to instantiate {key} from _target_: {e}") - - # Handle nested dataclass structures - if dataclasses.is_dataclass(current_attr) and isinstance(value, dict): - _apply_overrides(current_attr, value) - else: - try: - # Handle special case conversions if needed - final_value = value - - # If the original was a torch.dtype and value is a string, convert back - if isinstance(current_attr, torch.dtype) and isinstance(value, str): - from megatron.bridge.utils.activation_map import str_to_dtype - - try: - final_value = str_to_dtype(value) - except ValueError: - logger.warning(f"Could not convert string '{value}' back to torch.dtype") - final_value = value - - # Restore serialized callable fields (e.g. "relu" → F.relu) - if key in _SERIALIZABLE_CALLABLE_FIELDS and isinstance(final_value, str): - try: - final_value = str_to_callable(final_value) - except ValueError: - logger.warning(f"Could not restore callable for {key}='{final_value}'; keeping string") - - setattr(config_obj, key, final_value) - logger.debug(f"Set {type(config_obj).__name__}.{key} = {final_value}") - - except Exception as e: - logger.warning( - f"Could not set attribute {type(config_obj).__name__}.{key} to value '{value}'. Error: {e}" - ) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/nano/pretrain_nemotron_3_nano.py -```py -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import logging -import os -import sys -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.nemotronh.nemotron_3_nano import ( - nemotron_3_nano_pretrain_config as pretrain_config, -) -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) - - -logger: logging.Logger = logging.getLogger(__name__) - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Nemotron 3 Nano model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - help="Path to the YAML OmegaConf override file.", - ) - parser.add_argument("--per-split-data-args-path", type=str, help="Path to the per split data args file.") - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Nemotron 3 Nano 
pretraining script. - """ - args, cli_overrides = parse_cli_args() - - cfg: ConfigContainer = pretrain_config( - per_split_data_args_path=args.per_split_data_args_path, - ) - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/peft.md -```md -# Parameter-Efficient Fine-Tuning (PEFT) - -This guide explains how to configure and use PEFT in Megatron Bridge—covering LoRA and DoRA, required checkpoints, example 
configurations, and the internal design and training workflow—so you can integrate, scale, and checkpoint adapters efficiently. - -## Model Customization -Customizing models enables you to adapt a general pre-trained model to a specific use case or domain. This process produces a fine-tuned model that retains the broad knowledge from pretraining while delivering more accurate outputs for targeted downstream tasks. - -Model customization is typically achieved through supervised fine-tuning, which falls into two main approaches: Full-Parameter Fine-Tuning, known as Supervised Fine-Tuning (SFT), and Parameter-Efficient Fine-Tuning (PEFT). - -In SFT, all model parameters are updated to align the model’s outputs with the task-specific requirements. This approach often yields the highest performance but can be computationally intensive. - -PEFT, by contrast, updates only a small subset of parameters that are inserted into the base model at strategic locations. The base model weights remain frozen, and only the adapter modules are trained. This significantly reduces the number of trainable parameters—often to less than 1%—while still achieving near-SFT levels of accuracy. - -As language models continue to grow in size, PEFT is gaining popularity for its efficiency and minimal hardware demands, making it a practical choice for many real-world applications. - -## PEFT Configuration - -PEFT is configured as an optional attribute in `ConfigContainer`: - -```python -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.peft.lora import LoRA - -config = ConfigContainer( - # ... 
other required configurations - peft=LoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, - alpha=32, - dropout=0.1, - ), - checkpoint=CheckpointConfig( - pretrained_checkpoint="/path/to/pretrained/checkpoint", # Required for PEFT - save="/path/to/peft/checkpoints", - ), -) -``` - -```{note} -**Requirements**: PEFT requires `checkpoint.pretrained_checkpoint` to be set to load the base model weights. -``` - -## Supported PEFT Methods - -### [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) - -LoRA makes fine-tuning efficient by representing weight updates with two low-rank decomposition matrices. The original model weights remain frozen, while the low-rank decomposition matrices are updated to adapt to the new data, keeping the number of trainable parameters low. In contrast with adapters, the original model weights and adapted weights can be combined during inference, avoiding any architectural change or additional latency in the model at inference time. - -In Megatron Bridge, you can configure both the adapter bottleneck dimension and the target modules where LoRA is applied. LoRA supports any linear layer, which in transformer models typically includes: - -1. Query, key, and value (QKV) attention projections -2. The attention output projection -3. One or both MLP layers - -Megatron Bridge fuses the QKV projections into a single linear layer. As a result, LoRA learns a unified low-rank adaptation for the combined QKV representation. - -```python -from megatron.bridge.peft.lora import LoRA - -lora_config = LoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, # Rank of adaptation - alpha=32, # Scaling parameter - dropout=0.1, # Dropout rate -) -``` - -#### Key Parameters -The following table lists key hyperparameters for configuring DoRA, which control its module targeting, adaptation rank, scaling behavior, and regularization strategy. 
-| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `target_modules` | `List[str]` | All linear layers | Modules to apply DoRA to | -| `dim` | `int` | `32` | Rank of the low-rank adaptation | -| `alpha` | `float` | `16` | Scaling parameter for DoRA | -| `dropout` | `float` | `0.0` | Dropout rate for DoRA layers | - -#### Target Modules -The following table lists specific submodules within transformer architectures that are commonly targeted for LoRA, enabling efficient fine-tuning of attention and feedforward components: -| Module | Description | -|---------------|---------------------------------------------| -| `linear_qkv` | Query, key, value projections in attention | -| `linear_proj` | Attention output projection | -| `linear_fc1` | First MLP layer | -| `linear_fc2` | Second MLP layer | - -#### Wildcard Target Modules -For more granular targeting, individual layers can be targeted for the adapters. -```python -# Target specific layers only -lora_config = LoRA( - target_modules=[ - "*.layers.0.*.linear_qkv", # First layer only - "*.layers.1.*.linear_qkv", # Second layer only - ] -) -``` - -### Canonical LoRA: Performant vs Canonical Variants - -There are two variants of LoRA implemented in Megatron Bridge: "performant LoRA" (`LoRA`) and "canonical LoRA" (`CanonicalLoRA`). - -The distinction comes from the fact that Megatron Core optimizes the implementation of the following two linear modules by fusing multiple linear layers into one layer. When these layers are adapted with LoRA, the performant version also uses only one adapter for the linear module. The two linear modules are: - -1. `linear_qkv`: The projection matrix in self attention that transforms hidden state to query, key and value. Megatron Core fuses these three projection matrices into a single matrix to efficiently parallelize the matrix multiplication. 
Hence, performant LoRA applies a single adapter to the qkv projection matrix, whereas canonical LoRA applies three adapters. -2. `linear_fc1`: The first linear layer in the MLP module before the intermediate activation. For gated linear activations, Megatron Core fuses the up and gate projection matrices into a single matrix for efficient parallelization. Hence, performant LoRA applies a single adapter to the up and gate projection matrices, whereas canonical LoRA applies two adapters. - -The following two figures illustrate the difference between canonical and performant LoRA, using the `linear_qkv` layer as an example. Canonical LoRA runs three adapters sequentially, while performant LoRA runs one adapter. - -```{image} images/canonical_lora.png -:width: 640 -:align: center -``` - -```{image} images/performant_lora.png -:width: 400 -:align: center -``` - -Canonical LoRA conforms more closely to reference implementations, though it is slower in comparison since it performs several matrix multiplications sequentially, as described above. Performant LoRA has fewer parameters than canonical LoRA and can often achieve the same level of accuracy as canonical LoRA. - -Though not immediately apparent, performant LoRA is mathematically equivalent to canonical LoRA when the $A_q$, $A_k$, $A_v$ matrices are tied (i.e. forced to share the same weight during training) in `linear_qkv`, and similarly when the $A_{up}$, $A_{gate}$ matrices are tied in `linear_fc1`. - -```{admonition} Mathematical Proof: Performant LoRA Equivalence to Canonical LoRA with Tied Weights -:class: dropdown - -Let $[x \quad y]$ denote matrix concatenation. (In Megatron Bridge, this concatenation is done in an interleaved fashion, but this does not affect the proof below.) 
- -Let $A_q = A_k = A_v = A_{qkv}$ (weight tying) - -Then - -$$ -\begin{align} -& [query \quad key \quad value] \\ -= & [W_q x + B_q A_q x \quad W_k x + B_k A_k x \quad W_v x + B_v A_v x] \quad\quad \text{(canonical formulation)} \\ -= & [W_q x + B_q (A_{qkv} x) \quad W_k x + B_k (A_{qkv} x) \quad W_v x + B_v (A_{qkv} x)] \\ -= & [W_q \quad W_k \quad W_v] x + [B_q \quad B_k \quad B_v]A_{qkv} x \\ -= & W_{qkv} x + B_{qkv} A_{qkv} x \quad\quad \text{(performant formulation)} -\end{align} -$$ - -Note: dimensions of weight matrices are as follows: - -$$ -\begin{align} -W_q: &\ n_q d \times h \qquad & A_q: &\ r \times h \qquad & B_q: &\ n_q d \times r \\ -W_k: &\ n_{kv} d \times h \qquad & A_k: &\ r \times h \qquad & B_k: &\ n_{kv} d \times r \\ -W_v: &\ n_{kv} d \times h \qquad & A_v: &\ r \times h \qquad & B_v: &\ n_{kv} d \times r \\ -W_{qkv}: &\ (n_q+2n_{kv})d \times h \qquad & A_{qkv}: &\ r \times h \qquad & B_{qkv}: &\ (n_q+2n_{kv})d \times r -\end{align} -$$ - -Where: -- $n_q$: Number of attention heads (`num_attention_heads`). -- $n_{kv}$: Number of key value heads (`num_query_groups`). Note that if grouped query attention (GQA) is not used, $n_{kv} = n_q$. -- $h$: Transformer hidden size (`hidden_size`). -- $d$: Transformer head dimension (`kv_channels`). -- $r$: LoRA rank. 
- -``` - -#### Using Canonical LoRA - -```python -from megatron.bridge.peft.canonical_lora import CanonicalLoRA - -canonical_lora_config = CanonicalLoRA( - target_modules=[ - "linear_q", "linear_k", "linear_v", # Individual Q, K, V projections - "linear_proj", # Attention output projection - "linear_fc1_up", "linear_fc1_gate", # Individual up and gate projections - "linear_fc2" # Second MLP layer - ], - dim=16, # Rank of adaptation - alpha=32, # Scaling parameter - dropout=0.1, # Dropout rate -) -``` - -#### Key Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `target_modules` | `List[str]` | All canonical linear layers | Modules to apply canonical LoRA to | -| `dim` | `int` | `32` | Rank of the low-rank adaptation | -| `alpha` | `float` | `32` | Scaling parameter for LoRA | -| `dropout` | `float` | `0.0` | Dropout rate for LoRA layers | -| `dropout_position` | `Literal["pre", "post"]` | `"pre"` | Position for applying dropout | -| `lora_A_init_method` | `str` | `"xavier"` | Initialization method for LoRA A matrix | -| `lora_B_init_method` | `str` | `"zero"` | Initialization method for LoRA B matrix | - -#### Target Modules for Canonical LoRA - -The following table lists specific submodules within transformer architectures that are targeted for canonical LoRA: - -| Module | Description | -|--------|-------------| -| `linear_q` | Query projection in attention | -| `linear_k` | Key projection in attention | -| `linear_v` | Value projection in attention | -| `linear_proj` | Attention output projection | -| `linear_fc1_up` | Up projection in MLP | -| `linear_fc1_gate` | Gate projection in MLP | -| `linear_fc2` | Second MLP layer | - -```{note} -Canonical LoRA does not support `linear_qkv` or `linear_fc1` targets. Use the individual component targets (`linear_q`, `linear_k`, `linear_v` for QKV and `linear_fc1_up`, `linear_fc1_gate` for FC1) instead. 
-``` - -### [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353) - -DoRA decomposes the pre-trained weight into magnitude and direction. It learns a separate magnitude parameter while employing LoRA for directional updates, efficiently minimizing the number of trainable parameters. DoRA enhances both the learning capacity and training stability of LoRA, while avoiding any additional inference overhead. DoRA has been shown to consistently outperform LoRA on various downstream tasks. - -In Megatron Bridge, DoRA leverages the same adapter structure as LoRA. Megatron Bridge adds support for Tensor Parallelism and Pipeline Parallelism for DoRA, enabling DoRA to be scaled to larger model variants. - -```python -from megatron.bridge.peft.dora import DoRA - -dora_config = DoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, # Rank of adaptation - alpha=32, # Scaling parameter - dropout=0.1, # Dropout rate -) -``` - -#### Key Parameters - -The following parameters define how LoRA is applied to your model. 
They control which modules are targeted, the adaptation rank, scaling behavior, and dropout configuration: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `target_modules` | `List[str]` | All linear layers | Modules to apply DoRA to | -| `dim` | `int` | `32` | Rank of the low-rank adaptation | -| `alpha` | `float` | `16` | Scaling parameter for DoRA | -| `dropout` | `float` | `0.0` | Dropout rate for DoRA layers | - -## Full Configuration Example - -```python -from megatron.bridge.training.config import ( - ConfigContainer, TrainingConfig, CheckpointConfig -) -from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig -from megatron.bridge.data.hf_processors.squad import process_squad_example -from megatron.bridge.peft.lora import LoRA -from megatron.core.optimizer import OptimizerConfig - -# Configure PEFT fine-tuning -config = ConfigContainer( - model=model_provider, - train=TrainingConfig( - train_iters=1000, - global_batch_size=64, - micro_batch_size=1, # Required for packed sequences if used - eval_interval=100, - ), - optimizer=OptimizerConfig( - optimizer="adam", - lr=1e-4, # Lower learning rate for fine-tuning - weight_decay=0.01, - bf16=True, - use_distributed_optimizer=True, - ), - scheduler=SchedulerConfig( - lr_decay_style="cosine", - lr_warmup_iters=100, - lr_decay_iters=1000, - ), - dataset=HFDatasetConfig( - dataset_name="squad", - process_example_fn=process_squad_example, - seq_length=512, - ), - checkpoint=CheckpointConfig( - pretrained_checkpoint="/path/to/pretrained/model", # Required - save="/path/to/peft/checkpoints", - save_interval=200, - ), - peft=LoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=16, - alpha=32, - dropout=0.1, - ), - # ... other configurations -) -``` - -## PEFT Design in Megatron Bridge - -This section describes the internal design and architecture for how PEFT is integrated into Megatron Bridge. 
- -### Architecture Overview - -The PEFT framework introduces a modular design for integrating adapters into large-scale models. Its architecture consists of the following components: - -1. **Base PEFT Class**: All PEFT methods inherit from the abstract {py:class}`bridge.peft.base.PEFT` base class, which defines the core interface for module transformation. -2. **Module Transformation**: PEFT traverses the model structure to identify and transform target modules individually. -3. **Adapter Integration**: Adapters are injected into selected modules using a pre-wrap hook during model initialization. -4. **Checkpoint Integration**: Only adapter parameters are saved and loaded during checkpointing; base model weights remain frozen and unchanged. - -### PEFT Workflow in Training - -The training workflow for PEFT follows a structured sequence that ensures efficient fine-tuning with minimal overhead: -1. **Model Loading**: The base model is initialized from a specified pretrained checkpoint. -2. **PEFT Application**: Adapter transformations are applied after Megatron Core model initialization, but before distributed wrapping. -3. **Parameter Freezing**: Base model parameters are frozen to reduce training complexity; only adapter parameters are updated. -4. **Adapter Weight Loading**: When resuming training, adapter weights are restored from the checkpoint. -5. **Checkpoint Saving**: Only adapter states are saved, resulting in significantly smaller checkpoint files. - -### Key Benefits - -PEFT offers several advantages for scalable and efficient model fine-tuning: - -- **Reduced Checkpoint Size**: Adapter-only checkpoints are dramatically smaller than full model checkpoints. -- **Memory Efficiency**: Since gradients are computed only for adapter parameters, memory usage is significantly reduced. -- **Resume Support**: Training can be resumed seamlessly using adapter-only checkpoints, without reloading full model weights. 
-``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/gpt_oss/slurm_pretrain.sh -```sh -#!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ============================================================================== -# GPT-OSS 20B Pretraining -# -# GPT-OSS 20B is an MoE language model. Supports multiple parallelism configs: -# each "TP,PP,EP,CP,SP" runs sequentially. -# -# Usage: -# 1. Modify the #SBATCH directives below for your cluster -# 2. Set CONTAINER_IMAGE to your container path -# 3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled) -# 4. 
Submit: sbatch slurm_pretrain.sh -# ============================================================================== - -#SBATCH --job-name=gpt-oss-pretrain -#SBATCH --nodes=4 -#SBATCH --ntasks-per-node=8 # Change to 4 for GB200 (Blackwell, 4 GPUs/node) -#SBATCH --gpus-per-node=8 # Change to 4 for GB200 (Blackwell, 4 GPUs/node) -#SBATCH --time=24:00:00 -#SBATCH --partition=batch -#SBATCH --account=my_account -#SBATCH --output=logs/gpt_oss_pretrain_%j.out -#SBATCH --error=logs/gpt_oss_pretrain_%j.err -#SBATCH --exclusive - -# ============================================================================== -# CONFIGURATION -# ============================================================================== - -# Workspace directory for checkpoints and results -WORKSPACE=${WORKSPACE:-/workspace} - -# Base directory for container image and mounts (set if not already set, e.g. by launch_nemo.sh) -export WKDIR="${WKDIR:-}" - -# Model and training configurations -MODEL_NAME=gpt_oss_20b -RECIPE_NAME="${RECIPE_NAME:-${MODEL_NAME}_pretrain_config}" # bf16 (default) -# RECIPE_NAME="${MODEL_NAME}_pretrain_fp8_current_scaling_config" # Hopper FP8 current scaling -# RECIPE_NAME="${MODEL_NAME}_pretrain_mxfp8_config" # Blackwell MXFP8 -DATASET_NAME=dclm # set to "mock" for mock data; "dclm" uses DCLM when DCLM_DATA_DIR/DCLM_CACHE are set below -SEQ_LENGTH=4096 - -# When DATASET_NAME=dclm, set DCLM_DATA_DIR and DCLM_CACHE so the recipe uses DCLM; leave unset for mock -if [ "$DATASET_NAME" = "dclm" ]; then - # export DCLM_DATA_DIR="/path/to/dclm/preprocessed" - # export DCLM_CACHE="/path/to/cache" - : -else - unset DCLM_DATA_DIR - unset DCLM_CACHE -fi - -TRAIN_ITERS=1000 -GLOBAL_BATCH_SIZE=128 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR_WARMUP_ITERS=50 -LOG_INTERVAL=1 -WANDB_PROJECT=megatron-bridge-${DATASET_NAME} - -# Parallelism configs: "TP,PP,EP,CP,SP" per entry (max(TP*CP, EP)*PP must be divisible by the total number of GPUs) -PARALLELISM_CONFIGS=("2,4,4,1,True" "4,2,4,1,True" 
"2,4,4,2,True") - -# Container image (required) -CONTAINER_IMAGE="" -# CONTAINER_IMAGE="/path/to/container.sqsh" - -# Container mounts (optional; comma-separated for srun --container-mounts) -CONTAINER_MOUNTS="" -# CONTAINER_MOUNTS="/data:/data /workspace:/workspace" - -# ============================================================================== -# Environment Setup -# ============================================================================== - -# NCCL optimizations for large-scale training -export TORCH_NCCL_AVOID_RECORD_STREAMS=1 -export NCCL_NVLS_ENABLE=0 - -# UV cache on shared filesystem (recommended for multi-node setups) -# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync -# export UV_CACHE_DIR="/path/to/shared/uv_cache" - -# HuggingFace cache directory (recommended for shared filesystem) -# export HF_HOME="/path/to/shared/HF_HOME" - -# Authentication tokens (set these for your environment) -# export HF_TOKEN="hf_your_token_here" -# export WANDB_API_KEY="your_wandb_key_here" - -# ============================================================================== -# Job Execution -# ============================================================================== - -echo "======================================" -echo "GPT-OSS 20B Pretraining Job" -echo "======================================" -echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_JOB_NUM_NODES" -echo "GPUs per node: $SLURM_GPUS_PER_NODE" -echo "Model: $MODEL_NAME" -echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}" -echo "======================================" - -# Create logs directory if it doesn't exist -mkdir -p logs - -# Require container image -if [ -z "$CONTAINER_IMAGE" ]; then - echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image." 
- exit 1 -fi - -# Build srun command (shared across configs) -SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" -if [ -n "$CONTAINER_MOUNTS" ]; then - SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS" -fi -echo "SRUN base: $SRUN_CMD" -echo "======================================" - -# If using DCLM, pass dataset config via CLI overrides -DCLM_DATASET_OVERRIDES="" -if [ -n "${DCLM_DATA_DIR:-}" ] && [ -n "${DCLM_CACHE:-}" ]; then - BLEND_PATHS="" - for i in $(seq 1 10); do - pad=$(printf "%02d" $i) - PREFIX="${DCLM_DATA_DIR}/dclm_01_${pad}_text_document" - if [ -f "${PREFIX}.bin" ]; then - BLEND_PATHS="${BLEND_PATHS}\"${PREFIX}\"," - fi - done - BLEND_PATHS="${BLEND_PATHS%,}" - - if [ -n "$BLEND_PATHS" ]; then - DCLM_DATASET_OVERRIDES="dataset.blend=[[${BLEND_PATHS}],null] dataset.split='\"9999,8,2\"' dataset.path_to_cache=${DCLM_CACHE}" - else - echo "WARNING: No DCLM data found in ${DCLM_DATA_DIR}!" - fi -fi - -# Run each parallelism config in sequence -export CUDA_DEVICE_MAX_CONNECTIONS=1 -CONFIG_INDEX=0 -for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do - OLD_IFS=$IFS - IFS=',' read -r TP PP EP CP SP <<< "$CONFIG" - IFS=$OLD_IFS - - CONFIG_INDEX=$((CONFIG_INDEX + 1)) - - echo "" - echo "======================================" - echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP" - echo "======================================" - - # Build CLI overrides for this config - CLI_OVERRIDES=" \ - model.seq_length=$SEQ_LENGTH \ - train.train_iters=$TRAIN_ITERS \ - train.global_batch_size=$GLOBAL_BATCH_SIZE \ - train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ - scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - logger.log_interval=$LOG_INTERVAL \ - logger.wandb_project=$WANDB_PROJECT \ - logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - 
dataset.sequence_length=$SEQ_LENGTH \ - model.tensor_model_parallel_size=$TP \ - model.pipeline_model_parallel_size=$PP \ - model.expert_model_parallel_size=$EP \ - model.sequence_parallel=$SP \ - model.context_parallel_size=$CP \ - " - if [ -n "$DCLM_DATASET_OVERRIDES" ]; then - CLI_OVERRIDES="$CLI_OVERRIDES $DCLM_DATASET_OVERRIDES" - fi - CMD="uv run --no-sync python /opt/Megatron-Bridge/scripts/training/run_recipe.py" - CMD="$CMD --recipe ${RECIPE_NAME}" - CMD="$CMD $CLI_OVERRIDES" - - echo "Executing command..." - echo "$CMD" - echo "======================================" - - $SRUN_CMD bash -c "$CMD" - RUN_EXIT=$? - if [ $RUN_EXIT -ne 0 ]; then - echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT" - continue - fi -done - -echo "======================================" -echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)" -echo "======================================" - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/logging.md -```md -# Logging and Monitoring - -This guide describes how to configure logging in Megatron Bridge. It introduces the high-level `LoggerConfig`, explains experiment logging to TensorBoard and Weights & Biases (W&B), and documents console logging behavior. - -## LoggerConfig Overview - -{py:class}`~bridge.training.config.LoggerConfig` is the dataclass that encapsulates logging‑related settings for training. It resides inside the overall {py:class}`bridge.training.config.ConfigContainer`, which represents the complete configuration for a training run. - -### Timer Configuration Options - -Use the following options to control which timing metrics are collected during training and how they are aggregated and logged. - -#### `timing_log_level` -Controls which timers are recorded during execution: - -- **Level 0**: Logs only the overall iteration time. -- **Level 1**: Includes once-per-iteration operations, such as gradient all-reduce. 
-- **Level 2**: Captures frequently executed operations, providing more detailed insights but with increased overhead. - -#### `timing_log_option` -Specifies how timer values are aggregated across ranks. Valid options: - -- `"max"`: Logs the maximum value across ranks. -- `"minmax"`: Logs both minimum and maximum values. -- `"all"`: Logs all values from all ranks. - -#### `log_timers_to_tensorboard` -When enabled, the framework records timer metrics to supported backends such as TensorBoard. - - -### Diagnostic Options - -The framework provides several optional toggles for enhanced monitoring and diagnostics: - -- **Loss Scale**: Logs the current loss scale used in mixed-precision training. -- **Validation Perplexity**: Tracks model perplexity during validation. -- **CUDA Memory Statistics**: Reports detailed GPU memory usage. -- **World Size**: Displays the total number of distributed ranks. - -### Logging Options - -Use the following options to enable additional diagnostics and performance monitoring during training. - -- **`log_params_norm`**: Computes and logs the L2 norm of model parameters. If available, it also logs the gradient norm. -- **`log_energy`**: Activates the energy monitor, which records per-GPU energy consumption and instantaneous power usage. -- **`log_memory`**: Logs the memory usage of the model from `torch.cuda.memory_stats()`. -- **`log_throughput_to_tensorboard`**: Calculates the training throughput and utilization. -- **`log_runtime_to_tensorboard`**: Estimates total time remaining until the end of the training. -- **`log_l2_norm_grad_to_tensorboard`**: Computes and logs the L2 norm of gradients for each model layer. - - -## Experiment Logging -Both TensorBoard and W&B are supported for metric logging. When using W&B, it’s recommended to also enable TensorBoard to ensure that all scalar metrics are consistently logged across backends.
- -### TensorBoard - - -#### What Gets Logged - -TensorBoard captures a range of training and system metrics, including: - -- **Learning rate**, including decoupled LR when applicable -- **Per-loss scalars** for detailed breakdowns -- **Batch size** and **loss scale** -- **CUDA memory usage** and **world size** (if enabled) -- **Validation loss**, with optional **perplexity** -- **Timers**, when timing is enabled -- **Energy consumption** and **instantaneous power**, if energy logging is active - - -#### Enable TensorBoard Logging - 1) Install TensorBoard (if not already available): - ```bash - pip install tensorboard - ``` - 2) **Configure logging** in your training setup. In these examples, `cfg` refers to a `ConfigContainer` instance (such as one produced by a recipe), which contains a `logger` attribute representing the `LoggerConfig`: - - ```python - from megatron.bridge.training.config import LoggerConfig - - cfg.logger = LoggerConfig( - tensorboard_dir="./runs/tensorboard", - tensorboard_log_interval=10, - log_timers_to_tensorboard=True, # optional - log_memory_to_tensorboard=False, # optional - ) - ``` - - ```{note} - The writer is created lazily on the last rank when `tensorboard_dir` is set. - ``` - -#### Set the Output Directory - -TensorBoard event files are saved to the directory specified by `tensorboard_dir`. - -**Example with additional metrics enabled:** -```python -cfg.logger.tensorboard_dir = "./logs/tb" -cfg.logger.tensorboard_log_interval = 5 -cfg.logger.log_loss_scale_to_tensorboard = True -cfg.logger.log_validation_ppl_to_tensorboard = True -cfg.logger.log_world_size_to_tensorboard = True -cfg.logger.log_timers_to_tensorboard = True -``` - -### Weights & Biases (W&B) - - -#### What Gets Logged - -When enabled, W&B automatically mirrors the scalar metrics logged to TensorBoard. -In addition, the full run configuration is synced at initialization, allowing for reproducibility and experiment tracking. 
- - -#### Enable W&B Logging - - 1) Install W&B (if not already available): - ```bash - pip install wandb - ``` - 2) Authenticate with W&B using one of the following methods: - - Set `WANDB_API_KEY` in the environment before the run, or - - Run `wandb login` once on the machine. - 3) **Configure logging** in your training setup. In these examples, `cfg` refers to a `ConfigContainer` instance (such as one produced by a recipe), which contains a `logger` attribute representing the `LoggerConfig`: - - ```python - from megatron.bridge.training.config import LoggerConfig - - cfg.logger = LoggerConfig( - tensorboard_dir="./runs/tensorboard", # recommended: enables shared logging gate - wandb_project="my_project", - wandb_exp_name="my_experiment", - wandb_entity="my_team", # optional - wandb_save_dir="./runs/wandb", # optional - ) - ``` - -```{note} -W&B is initialized lazily on the last rank when `wandb_project` is set and `wandb_exp_name` is non-empty. -``` - -#### W&B Configuration with NeMo Run Launching - -For users launching training scripts with NeMo Run, W&B can be optionally configured using the {py:class}`bridge.recipes.run_plugins.WandbPlugin`. - -The plugin automatically forwards the `WANDB_API_KEY` and by default injects CLI overrides for the following logger parameters: - -- `logger.wandb_project` -- `logger.wandb_entity` -- `logger.wandb_exp_name` -- `logger.wandb_save_dir` - -This allows seamless integration of W&B logging into your training workflow without manual configuration. - - -### MLFlow - -Megatron Bridge can log metrics and artifacts to MLFlow, following the same pattern as the W&B integration. - -#### What Gets Logged - -When enabled, MLFlow receives: - -- Training configuration as run parameters -- Scalar metrics (losses, learning rate, batch size, throughput, timers, memory, runtime, norms, energy, etc.)
-- Checkpoint artifacts saved under an experiment-specific artifact path per iteration - -#### Enable MLFlow Logging - - 1) Install MLFlow (installed by default with Megatron Bridge): - - ```bash - pip install mlflow # or: uv add mlflow - ``` - - 2) Configure the tracking server (Optional): - - Either set `MLFLOW_TRACKING_URI` in the environment, or - - Pass an explicit `mlflow_tracking_uri` in the logger config. - - 3) Configure logging in your training setup. - - ```python - from megatron.bridge.training.config import LoggerConfig - - cfg.logger = LoggerConfig( - tensorboard_dir="./runs/tensorboard", - mlflow_experiment="my_megatron_experiment", - mlflow_run_name="llama32_1b_pretrain_run", - mlflow_tracking_uri="http://mlflow:5000", # optional - mlflow_tags={ # optional - "project": "llama32", - "phase": "pretrain", - }, - ) - ``` - - - -### Comet ML - -Megatron Bridge can log metrics and experiment metadata to Comet ML, following the same pattern as the W&B and MLFlow integrations. - -#### What Gets Logged - -When enabled, Comet ML receives: - -- Training configuration as experiment parameters -- Scalar metrics (losses, learning rate, batch size, throughput, timers, memory, runtime, norms, energy, etc.) -- Validation loss and perplexity metrics -- Checkpoint save/load metadata - -#### Enable Comet ML Logging - - 1) Install Comet ML: - - ```bash - pip install comet-ml - ``` - - 2) Authenticate: - - Either set `COMET_API_KEY` in the environment, or - - Pass an explicit `comet_api_key` in the logger config. - - 3) Configure logging in your training setup.
- - ```python - from megatron.bridge.training.config import LoggerConfig - - cfg.logger = LoggerConfig( - tensorboard_dir="./runs/tensorboard", - comet_project="my_project", - comet_experiment_name="llama32_1b_pretrain_run", - comet_workspace="my_workspace", # optional - comet_tags=["pretrain", "llama32"], # optional - ) - ``` - -```{note} -Comet ML is initialized lazily on the last rank when `comet_project` is set and `comet_experiment_name` is non-empty. -``` - -#### Comet ML Configuration with NeMo Run Launching - -For users launching training scripts with NeMo Run, Comet ML can be optionally configured using the {py:class}`bridge.recipes.run_plugins.CometPlugin`. - -The plugin automatically forwards the `COMET_API_KEY` and by default injects CLI overrides for the following logger parameters: - -- `logger.comet_project` -- `logger.comet_workspace` -- `logger.comet_experiment_name` - - -#### Progress Log - -When `logger.log_progress` is enabled, the framework generates a `progress.txt` file in the checkpoint save directory. - -This file includes: -- **Job-level metadata**, such as timestamp and GPU count -- **Periodic progress entries** throughout training - -At each checkpoint boundary, the log is updated with: -- **Job throughput** (TFLOP/s/GPU) -- **Cumulative throughput** -- **Total floating-point operations** -- **Tokens processed** - -This provides a lightweight, text-based audit trail of training progress, useful for tracking performance across restarts. - - -## Tensor Inspection - -Megatron Bridge integrates with TransformerEngine's tensor inspection features via NVIDIA DLFW Inspect. This integration, controlled by {py:class}`~bridge.training.config.TensorInspectConfig`, enables advanced debugging and analysis of tensor statistics during training. When enabled, the framework handles initialization, step tracking, and cleanup automatically. 
- -```{note} -**Current limitations:** Tensor inspection is currently supported only for linear modules in TransformerEngine (e.g., `fc1`, `fc2`, `layernorm_linear`). Operations like attention are not supported. -``` - -```{note} -This section covers Megatron Bridge configuration. For comprehensive documentation on features, configuration syntax, and advanced usage, see: - -- [TransformerEngine Debug Documentation](https://github.com/NVIDIA/TransformerEngine/tree/af2a0c16ec11363c0af84690cd877a59f898820e/docs/debug) -- [NVIDIA DLFW Inspect Documentation](https://github.com/NVIDIA/nvidia-dlfw-inspect/tree/4118044cc84f0183714a2ab1bc215fa49f9aaa82/docs) -``` - -### Installation - -Install NVIDIA DLFW Inspect if not already available: -```bash -pip install nvdlfw-inspect -``` - -### Available Features - -TransformerEngine provides the following debug features: - -- **LogTensorStats** – Logs high-precision tensor statistics: `min`, `max`, `mean`, `std`, `l1_norm`, `l2_norm`, `cur_amax`, `dynamic_range`. -- **LogFp8TensorStats** – Logs quantized tensor statistics for FP8 recipes: `underflows%`, `scale_inv_min`, `scale_inv_max`, `mse`. Supports simulating alternative recipes (e.g., tracking `mxfp8_underflows%` during per-tensor current-scaling training) -- **DisableFP8GEMM** – Runs specific GEMM operations in high precision -- **DisableFP8Layer** – Disables FP8 for entire layers -- **PerTensorScaling** – Enables per-tensor current scaling for specific tensors -- **FakeQuant** – Experimental quantization testing - -See [TransformerEngine debug features](https://github.com/NVIDIA/TransformerEngine/tree/af2a0c16ec11363c0af84690cd877a59f898820e/transformer_engine/debug/features) for complete parameter lists and usage details. - -### Configuration - -Configure tensor inspection using {py:class}`~bridge.training.config.TensorInspectConfig` with either a YAML file or inline dictionary. 
- -#### YAML Configuration - -```yaml -tensor_inspect: - enabled: true - features: ./conf/fp8_tensor_stats.yaml - log_dir: ./logs/tensor_inspect -``` - -**Example feature configuration file:** - -```yaml -fp8_tensor_stats: - enabled: true - layers: - layer_name_regex_pattern: ".*(fc2)" - transformer_engine: - LogFp8TensorStats: - enabled: true - tensors: [weight,activation,gradient] - stats: ["underflows%", "mse"] - freq: 5 - start_step: 0 - end_step: 100 -``` - -#### Python Configuration - -```python -from bridge.training.config import TensorInspectConfig - -# Option 1: inline python dict -cfg.tensor_inspect = TensorInspectConfig( - enabled=True, - features={ - "fp8_gradient_stats": { - "enabled": True, - "layers": {"layer_name_regex_pattern": ".*(fc1|fc2)"}, - "transformer_engine": { - "LogFp8TensorStats": { - "enabled": True, - "tensors": ["weight","activation","gradient"], - "stats": ["underflows%", "mse"], - "freq": 5, - "start_step": 0, - "end_step": 100, - }, - }, - } - }, - log_dir="./logs/tensor_inspect", -) - -# Option 2: reference external YAML -cfg.tensor_inspect = TensorInspectConfig( - enabled=True, - features="./conf/fp8_inspect.yaml", - log_dir="./logs/tensor_inspect", -) - -``` - -#### Layer Selection - -Features apply to linear modules matched by selectors in the `layers` section: - -- `layer_name_regex_pattern: .*` – All supported linear layers -- `layer_name_regex_pattern: .*layers\.(0|1|2).*(fc1|fc2|layernorm_linear)` – Linear modules in first three transformer layers -- `layer_name_regex_pattern: .*(fc1|fc2)` – MLP projections only -- `layer_types: [layernorm_linear, fc1]` – String matching (alternative to regex) - -Tensor-level selectors (`tensors`, `tensors_struct`) control which tensor roles are logged: `activation`, `gradient`, `weight`, `output`, `wgrad`, `dgrad`. - -### Output and Monitoring - -Tensor statistics are written to `tensor_inspect.log_dir` and forwarded to TensorBoard/W&B when enabled. 
- -**Log locations:** -- Text logs: `<tensor_inspect.log_dir>/nvdlfw_inspect_statistics_logs/` -- TensorBoard -- W&B - -### Performance Considerations - -- Use `freq > 1` to reduce overhead. Statistics collection is expensive for large models. -- Narrow layer selection with specific regex patterns rather than `.*` - - -## Console Logging - -Megatron Bridge uses the standard Python logging subsystem for console output. - -### Configure Console Logging - -To control console logging behavior, use the following configuration options: - -- `logging_level` sets the default verbosity level. It can be overridden via the `MEGATRON_BRIDGE_LOGGING_LEVEL` environment variable. -- `filter_warnings` suppresses messages at the WARNING level. -- `modules_to_filter` specifies logger name prefixes to exclude from output. -- `set_level_for_all_loggers` determines whether the logging level is applied to all loggers or only a subset, depending on the current implementation. - - -### Monitor Logging Cadence and Content - -To monitor training progress at regular intervals, the framework prints a summary line every `log_interval` iterations. - -Each summary includes: -- **Timestamp** -- **Iteration counters** -- **Consumed and skipped samples** -- **Iteration time (ms)** -- **Learning rates** -- **Global batch size** -- **Per-loss averages** -- **Loss scale** - -When enabled, additional metrics are printed: -- **Gradient norm** -- **Zeros in gradients** -- **Parameter norm** -- **Energy and power per GPU** - -Straggler timing reports follow the same `log_interval` cadence, helping identify performance bottlenecks across ranks. - - -### Minimize Timing Overhead - -To reduce performance impact, set `timing_log_level` to `0`. -Increase to `1` or `2` only when more detailed timing metrics are required, as higher levels introduce additional logging overhead.
- - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/mixed-precision.md -```md -# Mixed Precision Training - -Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. Megatron Bridge supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models through the {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` configuration. - -## Configuration Overview - -Mixed precision is configured in Megatron Bridge through the `mixed_precision` field in {py:class}`bridge.training.config.ConfigContainer`, which accepts either: -- A string name referencing a predefined recipe (e.g., `"bf16_mixed"`) -- A {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` object for custom configurations - -The mixed precision configuration automatically updates the model, optimizer, and distributed data parallel settings with the appropriate precision parameters. - -## Half-Precision Training - -Megatron Bridge supports half-precision FP16 and BF16 computation training via Megatron Core and the distributed optimizer. This training recipe uses half-precision in all layer computation while keeping the model states (optimizer states and master parameters) in single-precision. To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer step. - -### Using Predefined Recipes - -The simplest way to enable mixed precision is using predefined recipe names: - -```python -from megatron.bridge.training.config import ConfigContainer - -# Configure with BF16 mixed precision -config = ConfigContainer( - mixed_precision="bf16_mixed", - # ... 
other config parameters -) - -# Configure with FP16 mixed precision -config = ConfigContainer( - mixed_precision="fp16_mixed", - # ... other config parameters -) -``` - -### Custom Mixed Precision Configuration - -For more control, create a custom {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig`: - -```python -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -import torch - -# Custom BF16 configuration -bf16_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - autocast_enabled=False, - grad_reduce_in_fp32=True, -) - -config = ConfigContainer( - mixed_precision=bf16_config, - # ... other config parameters -) -``` - -## FP8 Training - -NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. Megatron Bridge uses the NVIDIA TransformerEngine (TE) to leverage speedups from FP8. For a more detailed overview, refer to the [TE documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html), specifically the FP8 format and recipe. 
- -### FP8 Configuration Parameters - -The {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` provides several FP8-specific parameters: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `fp8` | `Optional[str]` | `None` | FP8 format: `"hybrid"` (E4M3 for activations/weights, E5M2 for gradients) or `"e4m3"` | -| `fp8_recipe` | `str` | `"tensorwise"` | FP8 recipe type: `"tensorwise"`, `"delayed"`, `"blockwise"`, `"mxfp8"` (Blackwell only) | -| `first_last_layers_bf16` | `bool` | `False` | If True, retains first and last N TransformerBlocks in BF16 as opposed to FP8 | -| `num_layers_at_start_in_bf16` | `int` | `0` | Number of layers at the start of the model to keep in BF16 precision when `first_last_layers_bf16` is True | -| `num_layers_at_end_in_bf16` | `int` | `0` | Number of layers at the end of the model to keep in BF16 precision when `first_last_layers_bf16` is True | -| `fp8_margin` | `int` | `0` | Scaling factor shift by $2^{margin}$ | -| `fp8_amax_history_len` | `int` | `1` | Window size for amax history storage | -| `fp8_amax_compute_algo` | `str` | `"most_recent"` | Amax selection algorithm: `"max"` or `"most_recent"` | -| `fp8_param` | `Optional[bool]` | `None` | Store module-level parameters in FP8 | -| `fp8_param_gather` | `bool` | `False` | Enable FP8 parameter gathering | - -### FP8 Recipe Examples - -Use any of the predefined FP8 recipe names with the `mixed_precision` parameter: - -```python -# Example: BF16 with FP8 current scaling -config = ConfigContainer( - mixed_precision="bf16_with_fp8_current_scaling_mixed", - # ... other config parameters -) -``` - -## Available Mixed Precision Recipes - -Megatron Bridge provides numerous predefined mixed precision recipes for different use cases. You can use the {py:func}`~megatron.bridge.training.mixed_precision.get_mixed_precision_config` utility function to convert from a string shortname to a class instance. 
For the complete list of available recipes and their specific configurations, see the {py:mod}`megatron.bridge.training.mixed_precision` module. - - -### Custom FP8 Configuration - -For advanced use cases, create a custom FP8 configuration: - -```python -from megatron.bridge.training.mixed_precision import MixedPrecisionConfig -import torch - -# Custom FP8 configuration -fp8_config = MixedPrecisionConfig( - bf16=True, - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - fp8="hybrid", - fp8_recipe="tensorwise", - fp8_margin=0, - fp8_amax_history_len=1024, - fp8_amax_compute_algo="max", - fp8_param_gather=True, -) - -config = ConfigContainer( - mixed_precision=fp8_config, - # ... other config parameters -) -``` - -### Registering Custom Mixed Precision Recipes - -You can also register your own custom mixed precision configurations to work with the shortname system. Use the {py:func}`~megatron.bridge.training.mixed_precision.register` decorator on a function that returns a `MixedPrecisionConfig` object: - -```python -from megatron.bridge.training.mixed_precision import register, MixedPrecisionConfig - -@register -def my_custom_fp8_recipe() -> MixedPrecisionConfig: - """Custom FP8 recipe with specific settings for my use case.""" - return MixedPrecisionConfig( - bf16=True, - fp8="hybrid", - fp8_recipe="tensorwise", - fp8_param_gather=True, - # ... 
other custom settings - ) - -# Now you can use it with the utility function -config = get_mixed_precision_config("my_custom_fp8_recipe") -``` - -Common recipe categories include: -- **Half-precision recipes**: Basic BF16 and FP16 mixed precision -- **FP8 recipes**: Various FP8 scaling strategies (delayed, current, subchannel) -- **Architecture-specific recipes**: Optimized for specific GPU architectures (Hopper, Blackwell) -- **Model-specific recipes**: Tuned for particular model families - -## Configuration Synchronization - -When a mixed precision configuration is provided, it automatically synchronizes precision-related settings across the model, optimizer, and distributed data parallel (DDP) configurations. This ensures consistent precision behavior throughout the training pipeline. - -**Important**: Mixed precision settings will override any conflicting precision parameters that may have been set directly on the model, optimizer, or DDP configurations. The mixed precision configuration acts as the authoritative source for all precision-related parameters. - -For example, if you specify both: -```python -# This will be overridden -model_config.bf16 = False -optimizer_config.bf16 = False - -config = ConfigContainer( - model=model_config, - optimizer=optimizer_config, - mixed_precision="bf16_mixed", # This takes precedence during training - # ... other configs -) -``` - -The mixed precision configuration will set `bf16=True` on both the model and optimizer configs, overriding the explicitly set `False` values. This synchronization prevents configuration mismatches that could lead to training issues. 
- -## Performance Considerations - -- **FP8 recipes are experimental** and convergence has not been fully validated for all models -- **BF16** is generally recommended over FP16 for better numerical stability -- **FP8** provides the best performance on H100 GPUs but requires careful tuning -- **MXFP8** recipes are only supported on Blackwell architecture GPUs -- **Blockwise scaling** recipes are optimized for Hopper architecture GPUs - -## Resources - -- [Transformer Engine Documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html) -- [Intro to FP8, floating point formats, and mixed precision training](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Introduction-to-FP8) -- [Performance optimizations](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/advanced_optimizations.html) that are natively supported in Megatron Bridge by enabling FP8 training with TE - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-guide.md -```md -# Performance Tuning Guide - -Megatron-Bridge provides a wide range of features for performant and memory-efficient LLM training on GPUs, and comes pre-configured with optimal settings. However, factors such as model architecture, hyperparameters, GPU count, and GPU type can affect the available options, and additional tuning may be necessary to achieve optimal performance. This document explores the factors that affect training performance, highlights common issues, and outlines techniques for performance tuning that lead to higher MFU (Model FLOPS Utilization) and lower TCO (Total Cost of Ownership). - -```{Note} -This guide makes references to several configuration settings. These settings will be referenced relative to the config class that contains them, e.g. `OptimizerConfig.lr`. Please see the API documentation of the relevant config class for more details on configuration settings. -``` - -```{Note} -This guide references several configuration settings from `TransformerConfig`.
Please apply these to the appropriate ModelProvider for your model, e.g. `GPTModelProvider`, as the `ConfigContainer` does not accept a raw `TransformerConfig`. -``` - -## Low Precision Training - -1. Expected speedup of FP8 training compared to BF16 training - - > 1. The default low-precision LLM training recipe applies FP8 computation exclusively to the linear layers within the Transformer block, typically achieving a speedup of 1.2–1.5X. - > 2. However, the actual speedup depends on the proportion of training time spent on these linear layers. For instance, smaller LLMs with a limited hidden size exhibit lower FP8 speedup, as linear layers scale with O(sequence_length × hidden_size²) complexity, whereas the other element-wise computation layers (e.g., layer norms, dropouts, RoPE, and simple math functions) scale with O(sequence_length × hidden_size), and dot-product attention scales with O(sequence_length² × hidden_size). Consequently, the contribution of linear layers to the overall training time is smaller in such models. - > 3. Different FP8 recipes use varying quantization block sizes, affecting performance. Smaller quantization blocks generally incur higher overhead in both quantization and GEMM execution. For example, MXFP8 with a 1×32 quantization block performs less efficiently than full tensor-wise FP8 scaling. - -2. Common issues of low FP8 training speedup - - > 1. Host performance boundness when LLM uses small GPU kernels (see [Lowering Host Overhead and Jitters](#lowering-overhead-jitter)). - > 2. A low proportion of linear layers in training step time that use FP8 computation. - -## Parallel Mapping Strategies - -1. Data Parallelism using Distributed Optimizer - - > 1. You should begin with data-parallel (DP) mapping. As long as the model and activation memory fit within the GPUs, data parallelism generally offers optimal performance, minimizes communication overhead, and maximizes per-GPU tensor sizes (compared to per-tensor sharding). - > - > 2. 
Megatron-Bridge uses the distributed optimizer as the default method for data-parallel training. It shards master parameters and optimizer states across data-parallel ranks, reducing model state memory usage without increasing communication overhead compared to traditional data-parallel training. - > - > > 1. `OptimizerConfig.use_distributed_optimizer=true` - -2. Per-tensor Sharding (Tensor-parallel or Context-parallel mappings) - - > 1. Tensor parallelism (TP) is the primary recommendation when a model exceeds GPU memory capacity under data-parallel mapping. However, since it involves higher communication overhead, the tensor-parallel size should ideally be confined to the high-bandwidth intra-node network (NVLink domain). - > - > > 1. `TransformerConfig.tensor_model_parallel_size=` - > - > 2. When the sequence length in a training run is significantly larger than the hidden size, activation memory can overflow. In such cases, context parallelism (CP) helps by sharding tensors along the sequence dimension, allowing the workload to fit within limited GPU memory and improving performance. Like tensor parallelism (TP), CP requires inter-GPU communication of activations. However, for the same tensor sizes, CP generally results in lower communication volume. - -That said, CP’s effectiveness depends on the relative sizes of the sequence length and hidden size. When the sequence length is smaller than the hidden size, CP produces narrow (or "skinny") tensor shards on each GPU. This reduces data reuse and can degrade performance. - -Additionally, because CP shards activations, it also partitions optimizer states in distributed training. As a result, optimizer state partitioning spans both the data parallel (DP) and context parallel (CP) dimensions. - -> > 1. `TransformerConfig.context_parallel_size=` -> -> 1. Performance tips: -> -> > 1. 
A large tensor-parallel or context-parallel size is not recommended unless the hidden size or sequence length is large enough to maintain sufficient per-GPU parallelism and avoid excessive communication overhead. For example, using a tensor-parallel size of 8 for LLAMA 3 70B could lead to low GPU utilization and make training host-performance bound. -> > 2. You can combine TP and CP to optimize performance by balancing communication overhead. For example, using TP=2 along with CP=2 can give better performance than TP=4 when the sequence size is larger than the hidden size. -> > 3. For additional tips, see [Long Sequence Training](#long-sequence-train). - -1. Pipeline Parallelism - - > 1. Pipeline parallelism (PP) is necessary when a model cannot fit within GPU memory using tensor parallelism. Also, virtual pipeline parallelism (VPP) should be used in conjunction with pipeline parallelism to reduce the overhead caused by pipeline warm-up and flush bubbles. - > - > > 1. `TransformerConfig.pipeline_model_parallel_size=` - > > 2. `TransformerConfig.virtual_pipeline_model_parallel_size=` - > - > 2. Performance tips in PP and VPP sizing: - > - > > 1. PP can also be combined with per-tensor sharding methods to mitigate the impact of sharding inefficiencies and pipeline bubbles. For instance, TP4 + PP2 may outperform TP8 when both mappings fit into memory because using a large TP reduces per-GPU tensor sizes but increases the communication cost, increasing the exposed communication. - > > 2. VPP increases inter-stage communication overhead. When a global batch contains many micro-batches, using a smaller VPP size can improve performance, as the exposed communication cost outweighs the reduction in pipeline bubbles. - > - > 3. Asymmetric Transformer layer allocation across pipeline stages - > - > > 1. An LLM with a large vocabulary size has computationally heavy embedding lookup and projection operations, leading to load imbalance across pipeline stages. 
To address this, Megatron-Bridge provides an option to allocate one fewer Transformer layer in the first and last pipeline stages, which handle embedding lookup and projection, to better balance workloads. - > > - > > > 1. `GPTProvider.account_for_embedding_in_pipeline_split=true` - > > > 2. `GPTProvider.account_for_loss_in_pipeline_split=true` - -2. Expert Parallelism - - > 1. Expert Parallelism (EP) is designed specifically for Mixture-of-Experts (MoE) models to efficiently distribute sparse MLP weights across multiple chips. It can be used in combination with other parallelism strategies such as Tensor Parallelism (TP), Context Parallelism (CP), Pipeline Parallelism (PP), Data Parallelism (DP), and Fully Sharded Data Parallel (FSDP). In the current design, the dense attention part and the sparse MLP part are fully decoupled in terms of their TP, CP, and DP parallelism configurations. Expert Tensor Parallelism (ETP) is introduced to specifically control the tensor parallelism for the sparse MLP part. ETP uses TP for dense layers for the ranks allocated for EP in sparse layers. On the other hand, the baseline is DEP, which folds DP in dense layers for EP in sparse layers. - > - > > 1. `TransformerConfig.expert_model_parallel_size=` - > > 2. `TransformerConfig.expert_tensor_parallel_size=` - > - > 2. Performance tips in hybrid folding options and EP sizing: - > - > > 1. Typically, EP is kept within the high-bandwidth intra-node network (NVLink domain) to minimize the communication overhead it can introduce. However, using communication overlap techniques—such as pipeline overlap or 1F1B overlap—along with PP (e.g., DualPipe) might make it possible to expand EP into the inter-node networks. - > > - > > 2. Within the sparse MLP block, DP replaces CP because it has no impact on the computation pattern based on the dispatched tokens in each EP rank. - > > - > > 3. 
Usually, ETP is set to 1 to avoid significant communication overhead that comes with applying TP to MLP GEMMs. - > > - > > 4. When multiple experts are placed on a single chip after applying Expert Parallelism, enabling grouped GEMM can significantly improve computation efficiency. - > > - > > > 1. `TransformerConfig.moe_grouped_gemm=True` - -3. Fully Sharded Data Parallelism - - > 1. Megatron-Bridge supports PyTorch-native FSDP. FSDP can be used in combination with per-tensor sharding methods. - > - > > 1. To use PyTorch FSDP2: - > > - > > > 1. `DistributedInitConfig.use_torch_fsdp2=True` - > - > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios: - > - > > 1. Small models with a large sequence, thus the parameter AllGather and gradient ReduceScatter can effectively be hidden under computation and the short communication overlap causes minor interference to the computation under overlap. - > > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect. - > > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size. - - - - - - - - - - - - - - - - - - - -4. Heterogeneous Encoder Parallelism - - > 1. Encoder Pipeline Parallel - > - > > 1. Use `T5ModelProvider.encoder_pipeline_model_parallel_size`. - > > 2. In an Encoder-Decoder architecture like Multimodal models (VLMs like NeVA etc.), Encoder Pipeline Parallel can be used to add pipeline parallelism to the encoder. - > > 3. Pipeline parallelism controls the amount of pipelining in the decoder part. - > > 4. Encoder Pipeline Parallel is limited to 1 at the moment, i.e., the encoder can occupy a maximum of 1 PP stage. - > > 5. 
By default, Encoder Pipeline Parallel is 0 and Decoder Pipeline Parallel is 1. - > > 6. When the Encoder Pipeline Parallel size is 0, it shares the first PP stage of the Decoder. - > - > 2. Encoder Tensor Parallel - > - > > 1. Use `T5ModelProvider.encoder_tensor_model_parallel_size`. - > > 2. Since encoders tend to be much smaller than decoders, we also provide the ability to set a different amount of tensor parallelism to the encoder than the decoder. - > > 3. By default, encoder tensor parallel is set to 0, i.e., the amount of tensor parallelism in the encoder is equal to tensor parallelism in the decoder. - > > 4. To use this option, Encoder Pipeline Parallel must be greater than 0 as we need the encoder to be on its own pipeline stage. - > > 5. Encoder Tensor Parallel size is limited to be less than or equal to Tensor parallel size. - > - > 3. Total number of GPUs required when these features are used is: - > - > > 1. Data Parallel size * Context Parallel size * ((Encoder TP * Encoder PP) + (Decoder TP * Decoder PP)) - > - > 4. These features are experimental and may still have bugs. There are critical bug fixes that will be made in a future release. - -5. Parallel mapping strategies with NVL72 - - > 1. Training with only data parallelism or FSDP makes it straightforward to fully utilize the bandwidth of an NVL72 system. However, when combining multiple parallelism strategies, it's important to ensure that high-volume communicators remain confined within each NVL72 domain. For example, with TP=4, DP=16, and PP=4, the GPUs in the first TP group of DP1/PP1 spans both NVLink and network domains, causing communication performance to be bottlenecked by the slower network link. To avoid this, you may choose TP and DP sizes such that the product of TP × DP divides evenly into the NVL72 configuration. If the model-parallel size does not align naturally, padding may be required to support non-divisible group sizes. - > 2. 
To avoid this partitioning complexity, you can just use 64 GPUs out of the 72 GPUs. - -## Communication Overlaps and Tuning - -1. Data-parallel communication of Distributed Optimizer - - > 1. Distributed optimizer overlaps parameter AllGathers with the forward computation of the first micro-batch and gradient ReduceScatters with the backward computation of the last micro-batch. - > - > > 1. `DistributedDataParallelConfig.overlap_param_gather=true` - > > 2. `DistributedDataParallelConfig.overlap_grad_reduce=true` - > - > 2. When using the distributed optimizer with pipeline parallelism (PP) + virtual pipeline parallelism (VPP), DP communications overlap with multiple micro-batches, increasing the opportunity for effective overlap. Also, Megatron-Bridge aligns the execution timing of DP communications across pipeline-parallel ranks to synchronize the computing kernel slowdown from the overlap. - > - > > 1. `DistributedDataParallelConfig.align_param_gather=true` - > - > 3. Slow DP communication at large scaling training: - > - > > 1. Distributing optimizer states across a partial DP domain reduces communication costs over high-latency Ethernet networks. Model states remain replicated outside the distributed domain. During the final micro-batch backpropagation, gradient ReduceScatters occur within the distributed domain, followed by AllReduce in the non-distributed domain. Parameter AllGathers are performed only within the distributed domain. - > > - > > > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances= ` - > > - > > 2. A large message size for DP communication is recommended to maximize network bandwidth utilization. You can achieve this by increasing the communication bucket size. - > > - > > > 1. `DistributedDataParallelConfig.bucket_size=` - > - > 4. A common reason for DP communication overlap failure: - > - > > 1. 
Persistent Layer Normalization (LN) kernels from Transformer Engine use spin-waiting for all SMs in the GPU, causing the LN kernel and subsequent computation kernels to be scheduled only after DP communication. To prevent this, an appropriate SM margin should be configured using the following environment variables. - > > - > > > 1. `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>` - > > > 2. `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>` - - - - - -3. Tensor-parallel (TP) communication (with sequence parallelism) - - > 1. Megatron-Bridge currently uses the userbuffer backend in Transformer Engine for TP communication overlaps. This offers the pipelined overlap of the TP communication with dependent computation. - > - > > 1. `CommOverlapConfig.tp_comm_overlap` - > - > 2. The overlap method, resource, and precision of the TP communication overlaps are configurable, and the most performant configurations are set in the Megatron-Bridge training recipes by default. Also, you can set a custom TP communication overlap configuration via the below interface following the structure of TransformerLayerTPOverlapCfg class. - > - > > 1. `CommOverlapConfig.tp_comm_overlap_cfg=` - > - > 3. TP communication overlap setting tips - > - > > 1. Balancing the number of SMs between communication and GEMM - > > - > > > 1. For AllGather/ReduceScatter bulk and ReduceScatter pipelined overlap, you can adjust the number of SMs to balance communication and GEMM execution. Allocating too many SMs to communication may degrade GEMM performance, while too few may expose communication overhead. The default SM allocation for communication is 16, but you can fine-tune it based on profiling results. - > > > 2. `TPOverlapCfg.num_sm=` - > > - > > 2. CGA sizing to improve SM utilization - > > - > > > 1. The CGA size can be set between 1 and 4, but it should not exceed the number of SMs allocated for communication. 
We recommend using CGA ≤ 2 to prevent potential SM rasterization that could impact GEMM performance. - > > > 2. `TPOverlapCfg.cga_size=` - > > - > > 3. Use 4× splits for ReduceScatter and GEMM overlap to optimize the balance between GEMM efficiency and communication exposure. - > > - > > > 1. In GEMM-then-ReduceScatter pipeline overlap, a 1× ReduceScatter chunk remains exposed. A small split size increases communication exposure, while a large split size may degrade performance due to aggregated GEMM wave quantization. We find that num_splits = 4 generally provides the best performance. - > > > 2. `TPOverlapCfg.num_split=` - > - > 4. Common reasons for TP communication overlap failure on Hopper - > - > > 1. On H100 GPUs, the environment variable `CUDA_DEVICE_MAX_CONNECTIONS=1` should be set. Otherwise, the TP communication kernels may be scheduled only at the end of the GEMM they are supposed to overlap with, leaving the communication exposed. - > > 2. Pipelined TP communication overlap uses a static userbuffer registered upon model initialization. Therefore, it doesn't support activation tensors dynamically changing between steps or between Transformer layers. - -4. Context-parallel (CP) communication - - > 1. CP communication is configurable via "cp_comm_type", which can be "p2p", "all_gather", "a2a", or "a2a+p2p". Communications of "p2p" are implemented as ring-exchange send/receive operations, and they are hard-coded to overlap with the attention compute of sequence chunks. See [Long Sequence Training](#long-sequence-train) for more details. - -5. Expert-parallel communication - - > 1. To hide the A2A/AG communication introduced by EP, pipeline split overlap or 1F1B overlap alongside Pipeline Parallelism could be possible. It will be added to Megatron-Bridge in future releases. - -6. Pipeline-parallel (PP) send/receive communication - - > 1. PP send/recv in steady 1F1B states are set to be overlapped with computes by default. - > 2. The PP send/recv in warmup and flush are exposed by default.
- -(comm-data-types)= -## Communication Data Types - -1. FP8 data-parallel parameter AllGather in Distributed Optimizer and FSDP - - > 1. Megatron-Bridge supports FP8 parameter AllGather for per-tensor FP8 scaling recipes. This operation is lossless, enhancing performance while reducing memory usage. - > - > > 1. `MixedPrecisionConfig.fp8_param=true` - -2. BF16 (instead of FP32) data-parallel reduction in Distributed Optimizer and FSDP - - > 1. We have validated that BF16 reduction is numerically safe across numerous model training runs. However, BF16 reduction with a large data-parallel size (e.g., DP ≥ 128), especially the Ring reduction algorithm—which accumulates copies sequentially—may impact numerical stability. When using SHARP with NVIDIA InfiniBand, BF16 reduction is more robust, as it performs binary additions with higher precision for intermediate partial reductions. - > - > > 1. `DistributedDataParallelConfig.grad_reduce_in_fp32=false` - -3. FP8 tensor-parallel ReduceScatter - - > 1. When communication latency exceeds GEMM execution time, using FP8 input ReduceScatter can better hide communication overhead. This approach has low numerical impact, as the GEMM output must be cast to FP8 and then converted back to high precision during reduction. - > - > > 1. `TPOverlapCfg.fp8_buf=true` - -4. FP8 A2A Dispatch for expert parallel communication - - > 1. Megatron-Bridge is working on supporting FP8 A2A dispatch (before expert FC1), but still keeps BF16 A2A combine (after expert FC2). - -## Performance at Scale - -1. Scaling a training job is typically achieved by increasing the size of the data-parallel domain. In large-scale training, this often results in a small number of micro-batches per global batch—or even a single micro-batch—causing most computations to overlap with data-parallel communication. 
To maintain high performance in such scenarios, you should focus on minimizing the overhead of data-parallel communication and reducing host-driven inter-GPU jitter. - -2. You can lower the overhead of data-parallel communication by (1) reducing the communication precision e.g., BF16 for gradient reduction and FP8 parameter gathering, (2) improving the efficiency of communication by increasing the data-parallel communication message size or using the hierarchical data-parallel reduction, or (3) using multi-cast and switch reduction with SHARP in case of InfiniBand network. - - > 1. Using BF16 gradient reduction and FP8 parameter gather are described in [Communication Data Types](#comm-data-types) - > - > 2. For non-pipeline-parallel training, the data-parallel communication bucket size can be adjusted using the knobs below. In pipeline-parallel training, however, the bucket size is fixed and determined by the number of parameters assigned to each virtual pipeline rank. - > - > > 1. `DistributedDataParallelConfig.bucket_size=` - > - > 3. Setting the knob below splits the data-parallel domain of the distributed optimizer into a sharding domain and a replication domain. Gradient reduction then occurs in two stages—one within each domain—avoiding the use of a single large flat ring for collective operations that have high latency. - > - > > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances=` - -3. Ideas to reduce the host-driven inter-GPU jitters are discussed in [Lowering Host Overhead and Jitters](#lowering-overhead-jitter). - -(lowering-overhead-jitter)= -## Lowering Host Overhead and Jitters - -1. Common observation associated with host overhead - - > 1. Significantly low GPU FLOPS. - > 2. Small performance gain of low-precision (FP8) training. - > 3. Small LLMs with small hidden size or sequence length or fine-tuning without sequence packing - > 4. High multi-GPU communication variation. - -2. 
Increase micro-batch size and reduce per-tensor sharding - - > 1. The most common way to increase per-GPU tensor size is by increasing the micro-batch size or minimizing unnecessary per-tensor sharding (e.g., TP or CP) when GPU memory permits. - -3. Manual garbage collection to align the host interruption across GPUs - - > 1. Megatron-Bridge manually aligns the timing of garbage collection across GPUs, which significantly mitigates the host overhead compared to the baseline automatic garbage collection. - > - > > 1. `TrainingConfig.manual_gc_interval=` - -4. CUDA graph to eliminate repeated static host code execution - - > 1. Megatron-Bridge supports graph capture, significantly reducing host overhead. CUDA Graph is applicable only to LLMs with a static tensor shape across training steps. For example, it supports fixed-size packed sequences but does not handle sequences with varying lengths at each step. Also, MoE models with token-dropless propagation have limited CUDA graph support, restricted to the dense modules only. - > 2. CUDA graph requires additional memory for static buffer management, typically adding a few gigabytes for static buffers, while models with PP size > 1 may consume over 10GB. We are actively working to reduce this memory overhead. - > 3. See [CUDA Graphs](training/cuda-graphs.md) for configuration details (`cuda_graph_impl`, `cuda_graph_scope`). - -5. Bind CPU cores and memory to GPU processes - - > 1. Binding CPU cores to GPU processes helps mitigate long latency issues and ensures minimal variation in GPU queuing latency across GPUs. This optimization has a significant impact, particularly when the communication domain size is large. - > 2. Example command line for an x86-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/4)) --membind=$((SLURM_LOCALID/4)) ` - > 3.
Example command line for a Grace-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/2)) --membind=$((SLURM_LOCALID/2)) ` - -(reducing-memory-overflow)= -## Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency - -1. Activation recomputation - - > 1. Megatron-Bridge LLMs default to dot-product attention-only recomputation using Flash Attention, efficiently regenerating large intermediate activations from the attention operation with minimal computational overhead. - > - > 2. Megatron-Bridge also supports recomputing the full intermediate activations of a Transformer block, significantly reducing activation memory usage at the cost of approximately 30% additional computation. The number of Transformer blocks to recompute can be adjusted using a configurable setting. - > - > > 1. `TransformerConfig.recompute_granularity=full` - > > 2. `TransformerConfig.recompute_method=block` - > > 3. `TransformerConfig.recompute_num_layers=` - -2. Activation offloading to host memory - - > 1. Megatron-Bridge supports offloading activation memory to host memory, essential for training tasks constrained by activation memory. This is particularly useful for scenarios like (1) FSDP, where model state memory is minimized through sharding but activation memory remains high, (2) LoRA, which has frozen parameters but significant activation memory demands, and (3) training with a large sequence length. The efficiency of activation offloading depends on both the interconnect bandwidth between the GPU and host and the host memory bandwidth. From this perspective, Grace-based systems like the GB200 enhance offloading performance by optimizing these bandwidths. - > - > 2. The following knobs should be configured to enable offloading and specify the number of Transformer layers to offload to host memory. The maximum number of layers that can be offloaded depends on host memory capacity, which may be lower when the CPU is shared among multiple GPUs.
- > - > > 1. `TransformerConfig.cpu_offloading=True` - > > 2. `TransformerConfig.cpu_offloading_weights=False` - > > 3. `TransformerConfig.cpu_offloading_num_layers= ` - > - > 3. Environment variable settings to avoid resource conflict between CPU memory offloading and network communication - > - > > 1. `NCCL_NET_GDR_LEVEL=PHB # NCCL <=2.25` - > > 2. `NCCL_NET_GDR_C2C=1 # NCCL >=2.26` - > - > 4. Optimization tips - > - > > 1. Given the ratio between activation volume and computational operations, offloading all layer activations naively can become a performance bottleneck. Optimizing performance requires tuning the number of layers to offload while balancing it with recomputation. - -3. Weight memory-optimized BF16 training - - > 1. In BF16 training, Megatron-Bridge optimizes memory usage by storing only the BF16 remainder of the master weight copies for the next optimizer update. This is possible because BF16 data can be represented using a subset of FP32 bits, allowing Megatron-Bridge to avoid redundant storage of the FP32 portion used for BF16 representation. This is default enabled when using precision-aware optimizer in Megatron Core. - > - > > 1. `OptimizerConfig.use_precision_aware_optimizer=True` - -4. Common memory usage hikes from environment variable setting - - > 1. The below environment variables will (1) avoid preserving the buffers for NCCL communication and (2) disable NVLSharp when not used. Both these options lower the GPU memory usage. - > - > > 1. `TORCH_NCCL_AVOID_RECORD_STREAMS=1` - > > 2. `NCCL_NVLS_ENABLE=0` - > - > 2. While not enabled by default, you can further reduce memory usage caused by segmentation penalties by setting the env var shown below. - > - > > 1. `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` - -5. Keep parameters in FP8 at FP8 training - - > 1. In FP8 training, after optimizer step execution, we can keep the parameters in FP8. 
Compared to the baseline that keeps the intermediate weight values in BF16, FP8 parameters lower memory usage and improve communication performance. The below knob enables keeping the parameters in FP8. - > - > > 1. `MixedPrecisionConfig.fp8_param_gather=True` - -## Operator Fusion - -1. You can control specific fusion behaviors using the following configuration knobs: - - > 1. `TransformerConfig.masked_softmax_fusion=true` - > 2. `GPTProvider.cross_entropy_loss_fusion=true` - > 3. `GPTProvider.gradient_accumulation_fusion=true` - > 4. `TransformerConfig.bias_activation_fusion=true` - > 5. `TransformerConfig.bias_dropout_fusion=true` - > 6. `TransformerConfig.apply_rope_fusion=true` - -2. Megatron-Bridge offers different Flash Attention options, which can be chosen through the model config: - - > 1. Let Transformer Engine decide (default): `TransformerConfig.attention_backend=AttnBackend.auto` - > 2. FlashAttention2: `TransformerConfig.attention_backend=AttnBackend.flash` - > 3. cuDNN fused attention: `TransformerConfig.attention_backend=AttnBackend.fused` - -(long-sequence-train)= -## Long Sequence Training - -1. Problem of long sequence training - - > 1. Training with long sequence length can lead to memory overflow due to the huge memory cost of activations. The problem could be solved by recomputing activations in backward, but it can impose up to ~30% overheads in each training step. Context parallelism is a better solution which splits the sequence dimension across multiple GPUs, so that each GPU only computes and saves activations of a sequence chunk. In this way, memory overflow is addressed without introducing any redundant compute. - -2. CP to shard activation (knob) - - > 1. `TransformerConfig.context_parallel_size=` - > - > > 1. Both TP and CP can reduce activation memory overheads. It's not wise to be biased to either of them. Communications of TP and CP are overlapped by GEMM and Attention respectively. 
Blindly enlarging their sizes can make some communications hard to overlap. It's recommended to sweep a combination of TP+CP configs. The optimal config is expected to make full use of all related compute and do best overlapping, thereby achieving best end-to-end performance. - > - > 2. `TransformerConfig.cp_comm_type= or ` - > - > > 1. Megatron-Core provides multiple implementation variants of CP and allows you to make choices based on your specific use cases by configuring "cp_comm_type". The configuration value can be `p2p`, `all_gather`, `a2a`, or `a2a+p2p`. These communication types are compatible with each other, so they can be flexibly interleaved between transformer layers. You only need to provide a list, where each element corresponds to a layer. - > > 2. `p2p`: exchanges KV sequence chunks in ring-topology. The P2P communications can be fully overlapped. - > > 3. `all_gather`: inserts an all-gather before attention to get a full sequence of KV. The all-gather is exposed, but it should not impose big overheads if GQA/MQA are used, as they have very few KV heads. - > > 4. `a2a`: is an implementation of DeepSpeed Ulysses. A2A communications are added before and after the attention module to gather full sequence length and further scatter heads in CP domain. A2A cannot be overlapped. - > > 5. `a2a+p2p`: is a middle ground between `a2a` and `p2p`. This is useful for cases of big CP sizes, where each sequence chunk is too short to overlap P2P communications. It first does A2A in partial CP groups to gather relatively longer sequence chunks, then applies P2P implementation to the gathered chunks. It also can be helpful for hierarchical CP communications, for example A2A and P2P happen in NVLink and IBLink domains respectively. - > > 6. With small and medium CP size, `p2p` is the recommended configuration because communications can be fully overlapped; "all_gather" also should work fine with GQA/MQA. 
As for strongly-scaling a sequence length with big CP sizes, the short chunk length can barely overlap the `p2p` communications, so `a2a+p2p` ought to be the preferred choice. `a2a` could be adopted in some cases for its simplicity. However, CP size can be restricted with "a2a" because it requires the number of attention heads to be divisible by CP size. Restricted CP size will finally limit the sequence length that can be run. - -3. Activation recomputation (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow)) - -4. Activation offloading to host memory (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow)) - -## Sequence Packing for Performant Fine-Tuning - -1. Dataset preparation - - > 1. Fine-tuning datasets with shorter sequences of variable length can be packed into longer sequences, up to a set maximum length, for best efficiency. - -2. To use this feature, the microbatch size must be set to 1. In place of increasing the micro batch size, the maximum sequence length can be increased, which will effectively increase the number of individual sequences per packed sequence. - -3. Enabled with: - - > 1. `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size=` - > 2. `TrainingConfig.micro_batch_size=1` - -4. Performance benefits also include: - - > 1. Inconsistent lengths between sequences in the fine-tuning dataset would reduce the computation efficiency. With a micro-batch size over 1, all sequences must be padded with empty tokens to the length of the longest one in the micro-batch. Similarly, some optimizations like CUDA graphs require uniform sequence lengths between micro-batches. Packed sequences are arranged so that the total number of tokens per packed sequence is as close to the maximum length as possible, making most processed tokens useful. - > 2. 
Likewise, when using data parallelism, variance in the time needed to process different batches can result in all batches needing to wait for the longest to finish—and this variance is reduced with packed sequences. - -## GPU Core Clock Optimization - -1. Increase the clock ratio of GPU core over off-chip memory system - - > 1. NVIDIA GPUs support a GPU core clock boost mode, which increases the core clock rate by reducing the off-chip memory clock rate. This is particularly beneficial for LLMs, which are typically compute throughput-bound. - > - > > 1. `sudo nvidia-smi boost-slider --vboost 1 ` - -## Profiling Options for Analysis-based Performance Tuning - -1. Nsight Systems profile - - > 1. Megatron-Bridge provides an interface to enable the NVIDIA Nsight Systems profiler, which displays the GPU execution trace of all CUDA streams. You can check whether communication kernels overlap with computation kernels and adjust resource allocation to balance communication and computation. The Nsight Systems profile can be enabled using ProfilingConfig, as shown below. - > 2. `ProfilingConfig(use_nsys_profiler=True, profile_start_step=, profile_end_step=, profile_ranks=<[0,...]>)` - -2. Memory snapshot - - > 1. Megatron-Bridge provides an interface to extract the memory snapshot that shows the memory allocation bytes, the allocation lifespan, and the function call stack. Extracting the memory snapshot can be enabled by ProfilingConfig as shown below. - > 2. `ProfilingConfig(record_memory_history=True, memory_snapshot_path=)` - -## DeepEP: Common Issues and Solutions - -DeepEP is a communication library optimized for Mixture-of-Experts (MoE) all-to-all operations. When using DeepEP for cross-node Expert Parallelism (EP), there are several common issues related to network transport and GPU-NIC affinity that can significantly impact performance. - -> Note: DeepEP is best optimized for NVL8 systems such as the DGX-B200 NVL8 or DGX-H200 NVL8.
For GB200 NVL72 rack-scale systems, where 72 GPUs are interconnected within the same NVLINK domain, we recommend using [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) instead of DeepEP. HybridEP is maintained by NVIDIA and is specifically optimized for NVL72 rack scale systems. It is also integrated into the Megatron-core [fused all-to-all module](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.transformer.moe.fused_a2a.html) as an alternative backend under the `flex` token dispatcher. -> -> Learn more about GB200 MoE training best practices [here](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md). - -### 1. Why is my DeepEP not working - -1. What is IBGDA and why is it a problem - - DeepEP achieves optimal cross-node communication performance using InfiniBand GPU Direct Async (IBGDA), which is supported by ConnectX NICs in both InfiniBand and RoCEv2 modes. However, IBGDA is not always enabled by default—it often requires cluster administrators to actively configure the system and enable GPU Direct RDMA support in the InfiniBand (or RoCEv2) fabric. If this configuration step is skipped or unsupported in the cluster environment, IBGDA may be unavailable, which can prevent DeepEP inter-node EP capability from functioning. - -1. Network Transport: IBGDA vs. IBRC - - > 1. IBGDA (InfiniBand GPU Direct Async) requires cluster administrators to enable GPU Direct RDMA and configure the InfiniBand subsystem. Many clusters do not have IBGDA enabled by default. - > 2. The official DeepEP main branch has removed support for IBRC (InfiniBand Reliable Connection), which previously served as a fallback mechanism. 
With IBRC, a CPU proxy thread assists in processing the EP communication. This can degrade performance compared to IBGDA, but we find that the degradation does not outweigh the benefit of enabling wideEP in production training. - -2. Solution: NVSHMEM 3.5 with Automatic Transport Fallback - - > 1. NVSHMEM 3.5 introduces improved auto-fallback support for cross-node communication under various network configurations. It can automatically select the best available transport (IBGDA, IBRC, or other supported mechanisms) based on cluster capabilities. - > 2. To benefit from NVSHMEM’s auto-fallback in DeepEP: - > - Download the [official NVSHMEM 3.5.19-1 release](https://github.com/NVIDIA/nvshmem/releases/tag/v3.5.19-1). You can also choose to compile it from source in your container environment; we provide such examples later in this guide. - > - Switch to the [DeepEP branch with native NVSHMEM API integration](https://github.com/seth-howell/DeepEP/tree/nvshmem_native_apis). This branch enables automatic use of NVSHMEM’s fallback mechanisms without requiring any manual code modifications. - -### 2. GPU-NIC Affinity and Bandwidth Contention - -A common cause of poor DeepEP performance is incorrect GPU-to-NIC (Network Interface Card) affinity, where multiple GPUs compete for bandwidth on a single NIC. As noted in [DeepEP PR #466](https://github.com/deepseek-ai/DeepEP/pull/466), cross-node EP performance may degrade if multiple GPUs use the same NIC, due to certain GPU-NIC affinity in some clusters. This PR provides a solution by supporting the environment variable `DEEP_EP_DEVICE_TO_HCA_MAPPING` to specify GPU-to-NIC mappings so that each GPU is automatically bound to the optimal NIC for maximum DeepEP throughput. - -With this PR's solution, we need the following environment variables to map GPUs to NICs correctly. First, you need to find out the names of the NICs by running `ibstat`.
In our example, we found the following for one RoCEv2 DGX-B200 cluster: -``` -> ibstat | grep ^CA -CA 'rocep145s0' -CA 'rocep146s0' -CA 'rocep152s0' -CA 'rocep153s0' -CA 'rocep198s0' -CA 'rocep199s0' -CA 'rocep205s0' -CA 'rocep206s0' -``` - -Use the following environment variables to map GPUs to NICs. Note that `0:rocep145s0:1` is formatted as `<GPU_id>:<NIC_name>:<port>` so that each GPU will only be mapped to one dedicated NIC. -```bash -export NVSHMEM_ENABLE_NIC_PE_MAPPING=1 -export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1" -``` - -### 3. Build DeepEP - -In this section, we provide a reference Dockerfile that shows how to build NVSHMEM 3.5 and the customized DeepEP into your container environment. - -Note that the following example is provided for DGX-B200 NVL8 systems, but similar ideas apply to Hopper generation as well—just change the Dockerfile accordingly. For example, you just need to change the compile target for SM90. - -Key points: - -- NVSHMEM source: https://github.com/NVIDIA/nvshmem/tree/v3.5.19-1 -- DeepEP branch that we cherry-picked with all the fixes above: https://github.com/zhongbozhu/DeepEP/tree/nvshmem_deepep_gcp -- Example training container template for DGX-B200: https://github.com/yanring/Megatron-MoE-ModelZoo/blob/main/dockers/B200.Dockerfile - -**Dockerfile** -```bash -FROM nvcr.io/nvidia/pytorch:25.11-py3 as base - -# Other dependencies you may want -...
- -# Dependency of IBGDA -RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so - -# Clone DeepEP customized version -WORKDIR /home/dpsk_a2a -RUN git clone https://github.com/zhongbozhu/DeepEP.git ./deepep -RUN cd ./deepep && git checkout nvshmem_deepep_gcp && cd /home/dpsk_a2a - -# Clone NVSHMEM 3.5 https://github.com/NVIDIA/nvshmem -RUN git clone --branch v3.5.19-1 https://github.com/NVIDIA/nvshmem.git ./deepep-nvshmem -RUN cd ./deepep-nvshmem && git checkout v3.5.19-1 && cd /home/dpsk_a2a - -# Build nvshmem from source -# You can also download the pre-built binary, and skip the following -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y \ - clang \ - llvm-dev \ - libclang-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /home/dpsk_a2a/deepep-nvshmem -RUN mkdir -p build && mkdir -p install && \ - cmake -S . -B build \ - -DCMAKE_INSTALL_PREFIX=/home/dpsk_a2a/deepep-nvshmem/install \ - -DCUDA_HOME=/usr/local/cuda \ - -DMPI_HOME=/opt/hpcx/ompi \ - -DMPI_C_COMPILER=/opt/hpcx/ompi/bin/mpicc \ - -DMPI_CXX_COMPILER=/opt/hpcx/ompi/bin/mpicxx \ - -DNVSHMEM_MPI_SUPPORT=OFF \ - -DNVSHMEM_IBRC_SUPPORT=ON \ - -DNVSHMEM_IBGDA_SUPPORT=ON \ - -DNVSHMEM_IBDEVX_SUPPORT=OFF \ - -DNVSHMEM_UCX_SUPPORT=OFF \ - -DNVSHMEM_SHMEM_SUPPORT=OFF \ - -DNVSHMEM_PMIX_SUPPORT=OFF \ - -DNVSHMEM_USE_NCCL=OFF \ - -DNVSHMEM_USE_GDRCOPY=ON \ - -DGDRCOPY_HOME=/usr \ - -DNVSHMEM_USE_MLX5DV=ON \ - -DNVSHMEM_BUILD_TESTS=ON \ - -DNVSHMEM_BUILD_EXAMPLES=ON \ - -DNVSHMEM_BUILD_PYTHON_LIB=OFF \ - -DNVSHMEM_BUILD_BITCODE_LIBRARY=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES="100" && \ - cmake --build build -j && \ - cmake --install build - -ENV NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install -ENV LD_LIBRARY_PATH=${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH -ENV PATH=${NVSHMEM_DIR}/bin:$PATH - -## Build deepep -WORKDIR /home/dpsk_a2a/deepep -ENV TORCH_CUDA_ARCH_LIST="10.0" -ENV PIP_NO_BUILD_ISOLATION=1 -ENV 
CPATH=${CUDA_HOME}/include/cccl:$CPATH -RUN pip install --no-build-isolation . - -``` - -DeepEP provides `test_internode.py` to test and benchmark cross-node EP communication. In our experiment, when using 4 nodes of DGX-B200 (i.e., EP32), the achieved throughput for cross-EP is about 50 GB/s with IBRC. We provide an example SLURM script below for running such a test with DeepEP. - -In another experiment on the same cluster, with IBGDA enabled by the cluster admin, we observed approximately 10% higher inter-node performance—roughly 55 GB/s. To enable IBGDA, you need to set the environment variable `export NVSHMEM_IB_ENABLE_IBGDA=true`; there is no need to change the software version or container, because with the software provided above, both modes will work. - -```bash -srun --account= -N 4 -p batch --time 30 \ - --ntasks-per-node=1 --gpus-per-node=8 \ - --no-container-mount-home --container-mounts "/lustre:/lustre" \ - --container-image \ - --mpi=none --export=ALL \ - bash -lc ' -set -eo pipefail - -# Env Var for GPU-NIC mapping -export NVSHMEM_ENABLE_NIC_PE_MAPPING=1 -export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1" - - -# 1) Expand SLURM_JOB_NODELIST and grab the first hostname -headnode=$(python - </dev/null 2>&1; then - master_ip=$(getent ahostsv4 "$headnode" | awk "{print \$1; exit}") -else - master_ip="" -fi -MASTER_ADDR="${master_ip:-$headnode}" - -# 3) Export rendezvous env that matches test_internode.py expectations -export MASTER_ADDR -export MASTER_PORT=${MASTER_PORT:-29500} -export WORLD_SIZE=${SLURM_NNODES:-2} # number of nodes -export RANK=${SLURM_NODEID:-0} # 0..N-1 per node - -export OMP_NUM_THREADS=1 -python -u /home/dpsk_a2a/deepep/tests/test_internode.py -' - -``` - - - - - - - - - - -## Index - List of Tuning Knobs - -- `CommOverlapConfig.tp_comm_overlap` -- `CommOverlapConfig.tp_comm_overlap_cfg` -- `CUDA_DEVICE_MAX_CONNECTIONS` -- 
`TrainingConfig.manual_gc_interval` -- `MixedPrecisionConfig.fp8_param` -- `ProfilingConfig` -- `NCCL_NET_GDR_C2C` -- `NCCL_NET_GDR_LEVEL` -- `NCCL_NVLS_ENABLE` -- `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives>` -- `TransformerConfig.attention_backend` -- `AttnBackend` -- `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives>` -- `PYTORCH_CUDA_ALLOC_CONF` -- `TrainingConfig.micro_batch_size` -- `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size` -- `TransformerConfig.apply_rope_fusion` -- `TransformerConfig.bias_activation_fusion` -- `TransformerConfig.bias_dropout_fusion` -- `TransformerConfig.cp_comm_type` -- `TransformerConfig.cpu_offloading` -- `TransformerConfig.cpu_offloading_num_layers` -- `TransformerConfig.cpu_offloading_weights` -- `GPTProvider.cross_entropy_loss_fusion` -- `TransformerConfig.cuda_graph_impl` / `cuda_graph_scope` (see [CUDA Graphs](training/cuda-graphs.md)) -- `MixedPrecisionConfig.fp8_param_gather` -- `GPTProvider.gradient_accumulation_fusion` -- `TransformerConfig.masked_softmax_fusion` -- `TransformerConfig.recompute_granularity` -- `TransformerConfig.recompute_method` -- `TransformerConfig.recompute_num_layers` -- `OptimizerConfig.use_precision_aware_optimizer` -- `GPTProvider.account_for_embedding_in_pipeline_split` -- `GPTProvider.account_for_loss_in_pipeline_split` -- `TransformerConfig.context_parallel_size` -- `DistributedDataParallelConfig.align_param_gather` -- `DistributedDataParallelConfig.bucket_size` -- `DistributedDataParallelConfig.bucket_size` -- `DistributedDataParallelConfig.data_parallel_sharding_strategy` -- `DistributedDataParallelConfig.grad_reduce_in_fp32` -- `DistributedDataParallelConfig.num_distributed_optimizer_instances` -- `DistributedDataParallelConfig.overlap_grad_reduce` -- `DistributedDataParallelConfig.overlap_param_gather` -- `T5ModelProvider.encoder_pipeline_model_parallel_size` -- `T5ModelProvider.encoder_tensor_model_parallel_size` --
`TransformerConfig.expert_model_parallel_size=` -- `TransformerConfig.expert_tensor_parallel_size=` -- `TransformerConfig.moe_grouped_gemm` -- `DistributedInitConfig.use_torch_fsdp2` -- `TransformerConfig.pipeline_model_parallel_size` -- `TransformerConfig.tensor_model_parallel_size` -- `TransformerConfig.virtual_pipeline_model_parallel_size` -- `OptimizerConfig.use_distributed_optimizer` -- `TORCH_NCCL_AVOID_RECORD_STREAMS` -- `TPOverlapCfg.cga_size` -- `TPOverlapCfg.fp8_buf` -- `TPOverlapCfg.num_sm` -- `TPOverlapCfg.num_split` - - - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/cpu-offloading.md -```md -# CPU Offloading - -## Overview - -CPU Offloading in Megatron Bridge is a feature that reduces the peak memory usage of the GPU by offloading activations and inactive weights to CPU storage. Megatron Bridge supports offloading at the transformer layer level, allowing users to specify the number of transformer layers in their language model that require CPU offloading. During the forward pass, Megatron Bridge offloads activations at the optimal time and reloads them as needed during the backward pass. - -## Features - -- Supports training models with long sequence lengths by managing activation memory efficiently -- Enables high batch sizes per GPU by offloading activation memory -- Overlaps computation with data transfers (Host2Device and Device2Host) during offloading and reloading - -## Configuration - -CPU offloading is configured through the model provider parameters: - -```python -from megatron.bridge.models import GPTModelProvider - -# Basic CPU offloading configuration -model_config = GPTModelProvider( - # Model architecture - hidden_size=4096, - num_layers=32, - - # CPU offloading settings - cpu_offloading=True, # Enable CPU offloading - cpu_offloading_num_layers=16, # Number of layers to offload (0 to num_layers-1) - cpu_offloading_activations=True, # Offload activations - cpu_offloading_weights=True, # Offload weights - - # ... 
other model parameters -) -``` - -### Configuration Parameters - -- **`cpu_offloading`**: Set to `True` to enable CPU offloading -- **`cpu_offloading_num_layers`**: Number of transformer layers to offload (value between 0 and total number of layers minus one) -- **`cpu_offloading_activations`**: Whether to offload activations to CPU memory (default: `True`) -- **`cpu_offloading_weights`**: Whether to offload inactive weights to CPU memory (default: `False`) -- **`cpu_offloading_double_buffering`**: Enable double buffering across layers while reloading activations from CPU (default: `False`) - -### Offloading Strategies - -You can configure different combinations of offloading based on your memory requirements: - -#### Activations Only -```python -model_config = GPTModelProvider( - cpu_offloading=True, - cpu_offloading_num_layers=8, - cpu_offloading_activations=True, # Offload activations - cpu_offloading_weights=False, # Keep weights on GPU -) -``` - -#### Weights Only -```python -model_config = GPTModelProvider( - cpu_offloading=True, - cpu_offloading_num_layers=8, - cpu_offloading_activations=False, # Keep activations on GPU - cpu_offloading_weights=True, # Offload weights -) -``` - -#### Both Activations and Weights -```python -model_config = GPTModelProvider( - cpu_offloading=True, - cpu_offloading_num_layers=8, - cpu_offloading_activations=True, # Offload activations - cpu_offloading_weights=True, # Offload weights -) -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/README.md -```md -# Supported Models - -This directory contains documentation for all models supported by Megatron Bridge, including Large Language Models (LLMs) and Vision Language Models (VLMs). Each model documentation includes architecture details, conversion examples for Hugging Face ↔ Megatron Bridge, and links to training recipes. 
- -## Model Categories - -Megatron Bridge supports two main categories of models: - -### 🔤 Large Language Models (LLMs) - -Text-only models for language understanding and generation tasks. - -| Category | Model Count | Documentation | -|----------|-------------|---------------| -| **Large Language Models** | 13 models | [LLM Documentation](llm/README.md) | - -**Supported LLM Families:** - -- DeepSeek (V2, V3) -- Gemma (2, 3) -- GLM-4.5 -- GPT-OSS -- LLaMA (3, Nemotron) -- Mistral -- Moonlight -- Nemotron-H -- OLMoE -- Qwen (2, 2.5, 3, 3 MoE, 3-Next) - -### 🖼️ Vision Language Models (VLMs) - -Multimodal models that combine vision and language capabilities. - -| Category | Model Count | Documentation | -|----------|-------------|---------------| -| **Vision Language Models** | 4 models | [VLM Documentation](vlm/README.md) | - -**Supported VLM Families:** - -- Gemma 3 VL -- Nemotron Nano V2 VL -- Qwen (2.5 VL, 3 VL) - ---- - -## Quick Navigation - -### I want to - -**🔍 Find a specific LLM model** -→ Browse [Large Language Models](llm/README.md) documentation - -**🖼️ Find a specific VLM model** -→ Browse [Vision Language Models](vlm/README.md) documentation - -**🔄 Convert models between formats** -→ See [Bridge Guide](../bridge-guide.md) for Hugging Face ↔ Megatron conversion - -**🚀 Get started with training** -→ See [Training Documentation](../training/README.md) for training guides - -**📚 Understand model architectures** -→ Each model page documents architecture-specific features and configurations - -**🔧 Add support for a new model** -→ Refer to [Adding New Models](../adding-new-models.md) - -**📊 Use training recipes** -→ Read [Recipe Usage](../recipe-usage.md) for pre-configured training recipes - ---- - -## Model Documentation Structure - -Each model documentation page typically includes: - -1. **Model Overview** - Architecture and key features -2. **Available Variants** - Supported model sizes and configurations -3. 
**Conversion Examples** - Converting between Hugging Face and Megatron formats -4. **Training Recipes** - Links to training configurations and examples -5. **Architecture Details** - Model-specific features and configurations - ---- - -## Common Tasks by Model Type - -### For LLM Models - -**Training:** - -- Pretraining on large corpora -- Supervised fine-tuning (SFT) -- Parameter-efficient fine-tuning (PEFT/LoRA) -- Preference optimization (DPO) - -**Deployment:** - -- Export to Hugging Face format -- Integration with inference engines -- Model serving and deployment - -**Use Cases:** - -- Text generation -- Question answering -- Conversational AI -- Code generation - -### For VLM Models - -**Training:** - -- Multimodal pretraining -- Vision-language alignment -- Fine-tuning on visual tasks - -**Deployment:** - -- Export to Hugging Face format -- Multimodal inference - -**Use Cases:** - -- Image captioning -- Visual question answering -- Document understanding -- Multimodal reasoning - ---- - -## Related Documentation - -### Getting Started - -- **[Main Documentation](../README.md)** - Return to main documentation -- **[Bridge Guide](../bridge-guide.md)** - Hugging Face ↔ Megatron conversion -- **[Bridge Tech Details](../bridge-tech-details.md)** - Technical details of the bridge system - -### Training Resources - -- **[Training Documentation](../training/README.md)** - Comprehensive training guides -- **[Configuration Container](../training/config-container-overview.md)** - Training configuration -- **[Parallelisms Guide](../parallelisms.md)** - Data and model parallelism strategies -- **[Performance Guide](../performance-guide.md)** - Performance optimization - -### Advanced Topics - -- **[Adding New Models](../adding-new-models.md)** - Extending model support -- **[Recipe Usage](../recipe-usage.md)** - Using pre-configured training recipes -- **[Bridge RL Integration](../bridge-rl-integration.md)** - Reinforcement learning integration -- 
**[PEFT](../training/peft.md)** - Parameter-efficient fine-tuning - ---- - -## Model Support Overview - -### By Architecture Type - -**Decoder-Only (Autoregressive):** - -- GPT-style models (GPT-OSS) -- LLaMA family (LLaMA 3, LLaMA Nemotron) -- Qwen family (Qwen 2, 2.5, 3, 3-Next) -- Gemma family (Gemma 2, 3) -- DeepSeek family (DeepSeek V2, V3) -- Mistral, Moonlight, Nemotron-H, GLM-4.5 - -**Mixture-of-Experts (MoE):** - -- Qwen 3 MoE, Qwen 3-Next -- DeepSeek V2, V3 -- OLMoE - -**Vision-Language (Multimodal):** - -- Gemma 3 VL -- Qwen 2.5 VL, Qwen 3 VL -- Nemotron Nano V2 VL - -### By Provider - -**Meta/LLaMA:** - -- LLaMA 3 - -**NVIDIA:** - -- LLaMA Nemotron -- Nemotron-H -- Nemotron Nano V2 VL - -**Alibaba Cloud:** - -- Qwen (2, 2.5, 3, 3 MoE, 3-Next) -- Qwen VL (2.5, 3) - -**Google:** - -- Gemma (2, 3) -- Gemma 3 VL - -**DeepSeek:** - -- DeepSeek (V2, V3) - -**Other:** - -- Mistral AI (Mistral) -- GLM-4.5 -- GPT-OSS -- Moonlight -- OLMoE - ---- - -## Conversion Support - -All models support bidirectional conversion: - -- **Hugging Face → Megatron Bridge**: Load pretrained weights for training -- **Megatron Bridge → Hugging Face**: Export trained models for deployment - -Conversion features: - -- Automatic architecture detection -- Parallelism-aware conversion (TP/PP/VPP/CP/EP) -- Streaming and memory-efficient transfers -- Verification mechanisms for conversion accuracy - -Refer to the [Bridge Guide](../bridge-guide.md) for detailed conversion instructions. - ---- - -**Ready to explore?** Choose a model category: - -- [Large Language Models (LLMs)](llm/README.md) -- [Vision Language Models (VLMs)](vlm/README.md) - -Or return to the [main documentation](../README.md). - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/super/slurm_pretrain.sh -```sh -#!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ============================================================================== -# Nemotron 3 Super Pretraining -# -# Nemotron 3 Super is a 120B parameter model with A12B (Active 12 Billion) architecture -# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially. -# -# Note: The default recipe uses NVFP4 mixed precision, which requires Blackwell GPUs. -# For Hopper GPUs, add: mixed_precision="bf16_mixed" to CLI_OVERRIDES. -# -# Usage: -# 1. Modify the #SBATCH directives below for your cluster -# 2. Set CONTAINER_IMAGE to your container path -# 3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled) -# 4. 
Submit: sbatch slurm_pretrain.sh -# ============================================================================== - -#SBATCH --job-name=nemotron3-super-pretrain -#SBATCH --nodes=8 -#SBATCH --ntasks-per-node=8 -#SBATCH --gpus-per-node=8 -#SBATCH --time=24:00:00 -#SBATCH --partition=gpu -#SBATCH --account=my_account -#SBATCH --output=logs/nemotron3_super_pretrain_%j.out -#SBATCH --error=logs/nemotron3_super_pretrain_%j.err -#SBATCH --exclusive - -# ============================================================================== -# CONFIGURATION -# ============================================================================== - -# Workspace directory for checkpoints and results -WORKSPACE=${WORKSPACE:-/workspace} - -# Model and training configurations -MODEL_NAME=nemotron_3_super -DATASET_NAME=mock -SEQ_LENGTH=4096 -TRAIN_ITERS=50 -GLOBAL_BATCH_SIZE=128 -MICRO_BATCH_SIZE=1 -EVAL_ITERS=10 -LR_WARMUP_ITERS=10 -LOG_INTERVAL=1 -WANDB_PROJECT=megatron-bridge-${DATASET_NAME} - -# Parallelism configs: "TP,PP,EP,CP,SP" per entry -PARALLELISM_CONFIGS=("8,1,64,1,True" "4,1,64,2,True") - -# Container image (required) -CONTAINER_IMAGE="" -# CONTAINER_IMAGE="/path/to/container.sqsh" - -# Container mounts (optional, space-separated) -CONTAINER_MOUNTS="" -# CONTAINER_MOUNTS="/data:/data /workspace:/workspace" - -# ============================================================================== -# Environment Setup -# ============================================================================== - -# NCCL optimizations for large-scale training -export TORCH_NCCL_AVOID_RECORD_STREAMS=1 -export NCCL_NVLS_ENABLE=0 - -# UV cache on shared filesystem (recommended for multi-node setups) -# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync -# export UV_CACHE_DIR="/path/to/shared/uv_cache" - -# HuggingFace cache directory (recommended for shared filesystem) -# export HF_HOME="/path/to/shared/HF_HOME" - -# Authentication tokens (set these for your environment) -# export 
HF_TOKEN= -# export WANDB_API_KEY= - -# ============================================================================== -# Job Execution -# ============================================================================== - -echo "======================================" -echo "Nemotron 3 Super Pretraining Job" -echo "======================================" -echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_JOB_NUM_NODES" -echo "GPUs per node: $SLURM_GPUS_PER_NODE" -echo "Model: $MODEL_NAME" -echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}" -echo "======================================" - -# Create logs directory if it doesn't exist -mkdir -p logs - -# Require container image -if [ -z "$CONTAINER_IMAGE" ]; then - echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image." - exit 1 -fi - -# Build srun command (shared across configs) -SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE" -if [ -n "$CONTAINER_MOUNTS" ]; then - SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS" -fi -echo "SRUN base: $SRUN_CMD" -echo "======================================" - -# Run each parallelism config in sequence -CONFIG_INDEX=0 -for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do - IFS=',' read -r TP PP EP CP SP <<< "$CONFIG" - CONFIG_INDEX=$((CONFIG_INDEX + 1)) - echo "" - echo "======================================" - echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP" - echo "======================================" - - # Build CLI overrides for this config - CLI_OVERRIDES="\ - model.seq_length=$SEQ_LENGTH \ - train.train_iters=$TRAIN_ITERS \ - train.global_batch_size=$GLOBAL_BATCH_SIZE \ - train.micro_batch_size=$MICRO_BATCH_SIZE \ - train.eval_iters=$EVAL_ITERS \ - scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \ - checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - logger.log_interval=$LOG_INTERVAL \ - logger.wandb_project=$WANDB_PROJECT \ - 
logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \ - dataset.sequence_length=$SEQ_LENGTH \ - model.tensor_model_parallel_size=$TP \ - model.pipeline_model_parallel_size=$PP \ - model.expert_model_parallel_size=$EP \ - model.sequence_parallel=$SP \ - model.context_parallel_size=$CP" - - CMD="uv run --no-sync python scripts/training/run_recipe.py" - CMD="$CMD --recipe ${MODEL_NAME}_pretrain_config" - CMD="$CMD $CLI_OVERRIDES" - - echo "Executing command..." - echo $CMD - echo "======================================" - - $SRUN_CMD bash -c "$CMD" - RUN_EXIT=$? - if [ $RUN_EXIT -ne 0 ]; then - echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT" - exit $RUN_EXIT - fi -done - -echo "======================================" -echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)" -echo "======================================" - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/README.md -```md -# Large Language Models - -This directory contains documentation for Large Language Models (LLMs) supported by Megatron Bridge. Each model documentation includes examples for converting to/from 🤗 Hugging Face and links to training recipes. 
- -## Available Models - -Megatron Bridge supports the following LLM families: - -| Model | Documentation | Description | -|-------|---------------|-------------| -| **DeepSeek V2** | [deepseek-v2.md](deepseek-v2.md) | DeepSeek V2 model family | -| **DeepSeek V3** | [deepseek-v3.md](deepseek-v3.md) | DeepSeek V3 model family | -| **Gemma 2** | [gemma2.md](gemma2.md) | Google Gemma 2 models | -| **Gemma 3** | [gemma3.md](gemma3.md) | Google Gemma 3 models | -| **GLM-4.5** | [glm45.md](glm45.md) | GLM-4.5 model family | -| **GPT-OSS** | [gpt-oss.md](gpt-oss.md) | Open-source GPT-style models | -| **LLaMA 3** | [llama3.md](llama3.md) | Meta LLaMA 3 models | -| **LLaMA Nemotron** | [llama-nemotron.md](llama-nemotron.md) | NVIDIA LLaMA Nemotron models | -| **Mistral** | [mistral.md](mistral.md) | Mistral AI models | -| **Moonlight** | [moonlight.md](moonlight.md) | Moonlight model family | -| **Nemotron-3** | [nemotron3.md](nemotron3.md) | NVIDIA Nemotron-3 models | -| **Nemotron-3 Super** | [nemotron3-super.md](nemotron3-super.md) | NVIDIA Nemotron-3 Super models | -| **Nemotron-H** | [nemotronh.md](nemotronh.md) | NVIDIA Nemotron-H models | -| **OLMoE** | [olmoe.md](olmoe.md) | OLMoE (Open Language Model - Mixture of Experts) | -| **Qwen** | [qwen.md](qwen.md) | Alibaba Cloud Qwen model family | - -## Quick Navigation - -### I want to - -**🔍 Find a specific model** -→ Browse the model list above or use the [index page](index.md) - -**🔄 Convert models between formats** -→ Each model page includes conversion examples for Hugging Face ↔ Megatron Bridge - -**🚀 Get started with training** -→ See [Training Documentation](../../training/README.md) for training guides - -**📚 Understand model architecture** -→ Each model page documents architecture-specific features and configurations - -**🔧 Add support for a new model** -→ Refer to [Adding New Models](../../adding-new-models.md) - -## Related Documentation - -- **[Models Overview](../README.md)** - Return to main models 
documentation -- **[Vision Language Models](../vlm/README.md)** - VLM model documentation -- **[Training Documentation](../../training/README.md)** - Training and customization guides -- **[Bridge Guide](../../bridge-guide.md)** - Working with Hugging Face models -- **[Adding New Models](../../adding-new-models.md)** - Extending model support - -## Model Documentation Structure - -Each model documentation page typically includes: - -1. **Model Overview** - Architecture and key features -2. **Available Variants** - Supported model sizes and configurations -3. **Conversion Examples** - Converting between Hugging Face and Megatron formats -4. **Training Recipes** - Links to training configurations and examples -5. **Architecture Details** - Model-specific features and configurations - ---- - -**Ready to explore?** Choose a model from the list above or return to the [main documentation](../../README.md). - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotronh.md -```md -# Nemotron H and Nemotron Nano v2 - -[Nemotron H](https://huggingface.co/collections/nvidia/nemotron-h) and [Nemotron Nano v2](https://huggingface.co/collections/nvidia/nvidia-nemotron-v2) are families of **hybrid SSM-Attention models** from **NVIDIA** that combine Mamba (State Space Model) layers with traditional attention layers. These models achieve strong performance while maintaining computational efficiency through their hybrid architecture. - -The Nemotron H family includes models from 4B to 56B parameters with 8K context length, while Nemotron Nano v2 models (9B and 12B) are optimized for edge deployment with extended 128K context support. 
- -## Model Families - -### Nemotron H -- **4B**: 52 layers, 3072 hidden size, 8K context -- **8B**: 52 layers, 4096 hidden size, 8K context -- **47B**: 98 layers, 8192 hidden size, 8K context -- **56B**: 118 layers, 8192 hidden size, 8K context - -### Nemotron Nano v2 -- **9B**: 56 layers, 4480 hidden size, 128K context -- **12B**: 62 layers, 5120 hidden size, 128K context - -All models are supported via the Bridge system with specialized configurations for hybrid SSM-Attention architecture. - -## Model Architecture - -### Common Features Across All Models -- **Architecture**: Hybrid SSM-Attention (Mamba + Multi-Query Attention) -- **SSM**: Mamba-2 selective state space layers -- **Attention**: Multi-query attention with QK LayerNorm and RoPE -- **Activation**: Squared ReLU (SwiGLU in FFN) -- **Normalization**: RMSNorm -- **Position Embedding**: RoPE (Rotary Position Embeddings) -- **Hybrid Pattern**: Configurable layer-wise mixing of Mamba ("M") and Attention ("*") layers - -### Nemotron H 4B Specifications -- **Parameters**: 4B -- **Layers**: 52 (Hybrid pattern: `M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-`) -- **Hidden size**: 3072 -- **FFN hidden size**: 12288 -- **Attention heads**: 32 query heads, 8 key-value groups -- **KV channels**: 128 -- **Mamba heads**: 112 -- **Mamba head dim**: 64 -- **Mamba state dim**: 128 -- **Context Length**: 8K tokens - -### Nemotron H 8B Specifications -- **Parameters**: 8B -- **Layers**: 52 (Hybrid pattern: `M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-`) -- **Hidden size**: 4096 -- **FFN hidden size**: 21504 -- **Attention heads**: 32 query heads, 8 key-value groups -- **KV channels**: 128 -- **Mamba heads**: 128 -- **Mamba head dim**: 64 -- **Mamba state dim**: 128 -- **Context Length**: 8K tokens - -### Nemotron H 47B Specifications -- **Parameters**: 47B -- **Layers**: 98 -- **Hidden size**: 8192 -- **FFN hidden size**: 30720 -- **Attention heads**: 64 query heads, 8 key-value groups -- **KV channels**: 
128 -- **Mamba heads**: 256 -- **Mamba head dim**: 64 -- **Mamba state dim**: 256 -- **Context Length**: 8K tokens - -### Nemotron H 56B Specifications -- **Parameters**: 56B -- **Layers**: 118 -- **Hidden size**: 8192 -- **FFN hidden size**: 32768 -- **Attention heads**: 64 query heads, 8 key-value groups -- **KV channels**: 128 -- **Mamba heads**: 256 -- **Mamba head dim**: 64 -- **Mamba state dim**: 256 -- **Context Length**: 8K tokens - -### Nemotron Nano 9B v2 Specifications -- **Parameters**: 9B -- **Layers**: 56 (Hybrid pattern: `M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-`) -- **Hidden size**: 4480 -- **FFN hidden size**: 15680 -- **Attention heads**: 40 query heads, 8 key-value groups -- **KV channels**: 128 -- **Mamba heads**: 128 -- **Mamba head dim**: 80 -- **Mamba state dim**: 128 -- **Context Length**: 128K tokens -- **Vocab size**: 131,072 - -### Nemotron Nano 12B v2 Specifications -- **Parameters**: 12B -- **Layers**: 62 (Hybrid pattern: `M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-`) -- **Hidden size**: 5120 -- **FFN hidden size**: 20480 -- **Attention heads**: 40 query heads, 8 key-value groups -- **KV channels**: 128 -- **Mamba heads**: 128 -- **Mamba head dim**: 80 -- **Mamba state dim**: 128 -- **Context Length**: 128K tokens -- **Vocab size**: 131,072 - -## Key Features - -### Hybrid SSM-Attention Architecture -- **Mamba Layers (M)**: State space model layers for efficient long-range modeling -- **Attention Layers (*)**: Standard multi-query attention for complex reasoning -- **Configurable Pattern**: Each model has a predefined hybrid pattern balancing efficiency and performance - -### Advanced Optimizations -- **Squared ReLU Activation**: Enhanced non-linearity for better gradient flow -- **QK LayerNorm**: Applies LayerNorm to query and key projections for training stability -- **RoPE**: Rotary Position Embeddings with base 10000 -- **Multi-Query Attention**: Efficient attention with shared key-value heads -- 
**Selective State Space**: Mamba-2 architecture with selective gating - -### Extended Context (Nano v2) -- **128K Context Window**: Nemotron Nano v2 models support up to 128K tokens -- **Efficient Long-Range Modeling**: Hybrid architecture optimized for long sequences - -## Conversion with 🤗 Hugging Face - -### Load HF → Megatron - -#### Nemotron H Models -```python -from megatron.bridge import AutoBridge - -# Example: Nemotron H 8B -bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True) -provider = bridge.to_megatron_provider() - -# Configure parallelism before instantiating the model -provider.tensor_model_parallel_size = 2 -provider.pipeline_model_parallel_size = 1 -provider.context_parallel_size = 1 -provider.sequence_parallel = True - -provider.finalize() -model = provider.provide_distributed_model(wrap_with_ddp=False) - -# Other models: -# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-4B-Base-8K", trust_remote_code=True) -# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-47B-Base-8K", trust_remote_code=True) -# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-56B-Base-8K", trust_remote_code=True) -``` - -#### Nemotron Nano v2 Models -```python -from megatron.bridge import AutoBridge - -# Example: Nemotron Nano 9B v2 -bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base", trust_remote_code=True) -provider = bridge.to_megatron_provider() - -# Configure parallelism -provider.tensor_model_parallel_size = 2 -provider.pipeline_model_parallel_size = 1 -provider.context_parallel_size = 1 -provider.sequence_parallel = True - -provider.finalize() -model = provider.provide_distributed_model(wrap_with_ddp=False) - -# For instruct variant: -# bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-9B-v2", trust_remote_code=True) - -# For 12B model: -# bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base", trust_remote_code=True) -``` - 
-### Export Megatron → HF -```python -# Convert from a Megatron checkpoint directory to HF format -bridge.export_ckpt( - megatron_path="/results/nemotronh_8b/checkpoints/iter_0500000", - hf_path="./nemotronh-8b-hf-export", -) -``` - -## Examples - -- Checkpoint conversion: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py) -- Training scripts: [examples/models/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/train_any_basic.py) - -## Finetuning Recipes - -### Nemotron H 4B Finetuning - -#### LoRA Finetuning -```python -from megatron.bridge.recipes.nemotronh import nemotronh_4b_peft_config - -cfg = nemotronh_4b_peft_config( - tokenizer_path="nvidia/Nemotron-H-4B-Base-8K", - name="nemotronh_4b_lora", - pretrained_checkpoint="path/to/nemotronh/4b/checkpoint", - peft_scheme="lora", # or "dora" for DoRA - train_iters=1000, - global_batch_size=128, - finetune_lr=1e-4, -) -``` - -#### Full Supervised Finetuning (SFT) -```python -from megatron.bridge.recipes.nemotronh import nemotronh_4b_sft_config - -cfg = nemotronh_4b_sft_config( - tokenizer_path="nvidia/Nemotron-H-4B-Base-8K", - name="nemotronh_4b_sft", - pretrained_checkpoint="path/to/nemotronh/4b/checkpoint", - train_iters=1000, - global_batch_size=128, - finetune_lr=5e-6, # Lower LR for full SFT -) -``` - -### Nemotron H 8B Finetuning - -```python -from megatron.bridge.recipes.nemotronh import nemotronh_8b_peft_config - -# LoRA finetuning -cfg = nemotronh_8b_peft_config( - tokenizer_path="nvidia/Nemotron-H-8B-Base-8K", - name="nemotronh_8b_lora", - pretrained_checkpoint="path/to/nemotronh/8b/checkpoint", - peft_scheme="lora", - train_iters=1000, - global_batch_size=128, - finetune_lr=1e-4, -) -``` - -### Nemotron H 47B Finetuning - -```python -from megatron.bridge.recipes.nemotronh import nemotronh_47b_peft_config - -# LoRA finetuning (recommended for 47B) -cfg = 
nemotronh_47b_peft_config( - tokenizer_path="nvidia/Nemotron-H-47B-Base-8K", - name="nemotronh_47b_lora", - pretrained_checkpoint="path/to/nemotronh/47b/checkpoint", - peft_scheme="lora", - train_iters=1000, - global_batch_size=128, - finetune_lr=1e-4, -) -``` - -### Nemotron H 56B Finetuning - -```python -from megatron.bridge.recipes.nemotronh import nemotronh_56b_peft_config - -# LoRA finetuning (recommended for 56B) -cfg = nemotronh_56b_peft_config( - tokenizer_path="nvidia/Nemotron-H-56B-Base-8K", - name="nemotronh_56b_lora", - pretrained_checkpoint="path/to/nemotronh/56b/checkpoint", - peft_scheme="lora", - train_iters=1000, - global_batch_size=128, - finetune_lr=1e-4, -) -``` - -### Nemotron Nano 9B v2 Finetuning - -```python -from megatron.bridge.recipes.nemotronh import nemotron_nano_9b_v2_peft_config - -# LoRA finetuning -cfg = nemotron_nano_9b_v2_peft_config( - tokenizer_path="nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base", - name="nano_9b_v2_lora", - pretrained_checkpoint="path/to/nano/9b/v2/checkpoint", - peft_scheme="lora", - train_iters=1000, - global_batch_size=128, - seq_length=2048, # Can use up to 128K - finetune_lr=1e-4, -) -``` - -### Nemotron Nano 12B v2 Finetuning - -```python -from megatron.bridge.recipes.nemotronh import nemotron_nano_12b_v2_peft_config - -# LoRA finetuning -cfg = nemotron_nano_12b_v2_peft_config( - tokenizer_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base", - name="nano_12b_v2_lora", - pretrained_checkpoint="path/to/nano/12b/v2/checkpoint", - peft_scheme="lora", - train_iters=1000, - global_batch_size=128, - seq_length=2048, # Can use up to 128K - finetune_lr=1e-4, -) -``` - -## Default Configurations - -### Nemotron H Models - -#### 4B - LoRA (1 node, 8 GPUs) -- TP=1, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: False -- Precision: BF16 mixed -- Optimized for single-GPU finetuning - -#### 4B - Full SFT (1 node, 8 GPUs) -- TP=1, PP=1, CP=1, LR=5e-6 -- Sequence Parallel: False -- Precision: BF16 mixed - -#### 8B - LoRA (1 node, 8 GPUs) 
-- TP=1, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: False -- Precision: BF16 mixed - -#### 8B - Full SFT (1 node, 8 GPUs) -- TP=2, PP=1, CP=1, LR=5e-6 -- Sequence Parallel: True -- Precision: BF16 mixed - -#### 47B - LoRA (2+ nodes) -- TP=4, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: False -- Precision: FP8 hybrid (recommended) - -#### 47B - Full SFT (4+ nodes) -- TP=8, PP=1, CP=1, LR=5e-6 -- Sequence Parallel: True -- Precision: FP8 hybrid - -#### 56B - LoRA (2+ nodes) -- TP=4, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: False -- Precision: FP8 hybrid (recommended) - -#### 56B - Full SFT (4+ nodes) -- TP=8, PP=1, CP=1, LR=5e-6 -- Sequence Parallel: True -- Precision: FP8 hybrid - -### Nemotron Nano v2 Models - -#### 9B - LoRA (1 node, 8 GPUs) -- TP=2, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: True -- Precision: BF16 mixed -- Context: Up to 128K tokens - -#### 9B - Full SFT (1 node, 8 GPUs) -- TP=2, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: True -- Precision: BF16 mixed - -#### 12B - LoRA (2 nodes, 16 GPUs) -- TP=4, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: True -- Precision: FP8 hybrid (recommended) -- Context: Up to 128K tokens - -#### 12B - Full SFT (2 nodes, 16 GPUs) -- TP=4, PP=1, CP=1, LR=1e-4 -- Sequence Parallel: True -- Precision: FP8 hybrid - -## API Reference - -### Nemotron H -- Nemotron H recipes: [bridge.recipes.nemotronh](../../apidocs/bridge/bridge.recipes.nemotronh.md) -- Nemotron H model providers: [bridge.models.nemotronh](../../apidocs/bridge/bridge.models.nemotronh.md) - -### Nemotron Nano v2 -- Nemotron Nano v2 recipes: [bridge.recipes.nemotronh.nemotron_nano_v2](../../apidocs/bridge/bridge.recipes.nemotronh.md) -- Nemotron Nano v2 model providers: [bridge.models.nemotronh.NemotronNanoModelProvider9Bv2](../../apidocs/bridge/bridge.models.nemotronh.md) - -## Performance Optimizations - -### Memory Efficiency -- **Selective Recomputation**: Reduces activation memory for larger models -- **Sequence Parallelism**: Distributes sequence dimension across 
GPUs (enabled for 8B+) -- **Context Parallelism**: Support for ultra-long sequences (Nano v2) -- **Manual GC**: Aggressive garbage collection for stable memory usage -- **Precision-aware optimizer**: BF16/FP8 gradients with FP32 master weights - -### Compute Efficiency -- **Mamba-2 Optimizations**: Efficient selective state space computations -- **Hybrid Architecture**: Balanced mix of Mamba and Attention layers -- **Squared ReLU**: Efficient activation function with good gradient properties -- **RoPE Fusion**: Optional optimization for position embeddings -- **Multi-Query Attention**: Reduced KV cache memory and compute - -### Hybrid Pattern Optimization -The hybrid override pattern determines which layers use Mamba (M) vs Attention (*): -- **Mamba layers**: Fast, memory-efficient, good for long-range dependencies -- **Attention layers**: Better for complex reasoning and multi-token relationships -- **Optimal patterns**: Pre-configured per model size based on extensive experimentation - -## Pipeline Parallelism Layouts - -Nemotron H models support several PP configurations with pre-defined layouts: -- **PP=1**: No pipelining (default for most configurations) -- **PP=2**: Supported with symmetric layer splits -- **PP=4**: Supported for larger models (47B, 56B) -- **VP (Virtual Pipeline)**: Supported for reducing pipeline bubbles - -## Hugging Face Model Cards - -### Nemotron H Models -- **4B Base**: [nvidia/Nemotron-H-4B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-4B-Base-8K) -- **8B Base**: [nvidia/Nemotron-H-8B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-8B-Base-8K) -- **47B Base**: [nvidia/Nemotron-H-47B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-47B-Base-8K) -- **56B Base**: [nvidia/Nemotron-H-56B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-56B-Base-8K) - -### Nemotron Nano v2 Models -- **9B Base**: [nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base) -- **9B Instruct**: 
[nvidia/NVIDIA-Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2) -- **12B Base**: [nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base) -- **12B Instruct**: [nvidia/NVIDIA-Nemotron-Nano-12B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2) - -## Technical Resources - -### Research Papers -- **Nemotron Technical Report**: [arXiv:2508.14444](https://arxiv.org/abs/2508.14444) -- **Mamba-2**: [Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality](https://arxiv.org/abs/2405.21060) - -## Related Documentation - -- Recipe usage and customization: [Recipe usage](../../recipe-usage.md) -- Training configuration: [Configuration overview](../../training/config-container-overview.md) -- Training entry points: [Entry points](../../training/entry-points.md) -- PEFT methods (LoRA, DoRA): [PEFT Guide](../../training/peft.md) - - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/megatron-fsdp.md -```md -# Megatron FSDP - -Megatron FSDP is the practical fully sharded data parallel path in Megatron -Bridge today. It shards parameters, gradients, and optimizer state across data -parallel ranks, which can reduce model-state memory substantially compared with -plain Distributed Data Parallel (DDP) or the distributed optimizer path. - -This page is the stable overview for what Megatron FSDP is, when to use it, and -what constraints matter. For operational enablement, code anchors, and -verification commands, see [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md). - -## What It Is - -Megatron FSDP is the Megatron-Core custom FSDP implementation exposed in Bridge -through `use_megatron_fsdp`. 
- -Compared with other data-parallel strategies: - -| Feature | DDP | Distributed Optimizer | Megatron FSDP | -|---|---|---|---| -| Parameter Storage | Replicated | Replicated | Sharded | -| Optimizer States | Replicated | Sharded | Sharded | -| Gradient Communication | All-reduce | Reduce-scatter | Reduce-scatter | -| Parameter Communication | None | All-gather (after update) | All-gather (on-demand) | -| Memory Efficiency | Baseline | High | Highest | -| Communication Overhead | Low | Medium | Medium-High | - -The practical consequence is that Megatron FSDP is most useful when model-state -memory, rather than activation memory, is the main bottleneck. - -## When to Use It - -Megatron FSDP is a good fit when all of the following are true: - -- the model is too large for plain DDP or distributed optimizer -- you want the strongest currently supported FSDP path in Bridge -- you are willing to trade more communication for lower memory -- you can adopt the required FSDP checkpoint format - -Prefer another path when: - -- DDP already fits comfortably and simplicity matters most -- distributed optimizer gives enough memory relief without fully sharding -- you are evaluating PyTorch FSDP2 for production use on this branch - -## Stable Requirements - -Megatron FSDP in Bridge requires: - -- `use_megatron_fsdp` to be enabled -- checkpoint format `fsdp_dtensor` -- standard rank initialization order - -The `fsdp_dtensor` format uses PyTorch DTensor and -`torch.distributed.checkpoint` (DCP) to store sharded parameters and optimizer -state. It is **not interchangeable** with `torch_dist` or `zarr` checkpoints — -you cannot load an `fsdp_dtensor` checkpoint into a non-FSDP run or vice versa. - -`fsdp_dtensor` is compatible with 5D parallelism (TP + PP + DP + CP + EP). 
-Because DCP stores DTensor placement metadata, checkpoints saved under one -parallelism layout can be loaded under a different layout (e.g., change TP or PP -size between runs) — DCP handles the shard remapping automatically. The one -unsupported combination is `use_tp_pp_dp_mapping=True`, which uses an -alternative rank-initialization order that conflicts with FSDP sharding. - -Important stable constraints: - -- `use_megatron_fsdp` and `use_torch_fsdp2` are mutually exclusive -- `use_tp_pp_dp_mapping` is not supported with Megatron FSDP -- legacy checkpoint formats such as `torch_dist` and `zarr` are not valid for - Megatron FSDP save/load - -When Megatron FSDP is enabled, Bridge also adjusts some settings -automatically, including disabling `average_in_collective` and several -buffer-reuse optimizations that do not match the FSDP path. - -## Compatibility and Caveats - -At the configuration level, Megatron FSDP is intended to work with: - -- tensor parallelism -- pipeline parallelism -- context parallelism -- expert parallelism -- BF16 or FP16 mixed precision - -However, not every combination has the same level of in-repo validation or -performance evidence. Treat broad compatibility as code-supported first, not as -fully benchmark-proven for every combination. - -Two practical caveats matter most: - -1. Public recipes may expose `use_megatron_fsdp` while still defaulting to a - non-FSDP checkpoint format. The checkpoint requirement is stable and - mandatory even when recipe ergonomics lag behind. -2. FSDP reduces model-state memory, not activation memory. For long-sequence or - activation-bound workloads, other techniques such as context parallelism, - activation recomputation, or CPU offloading may still be needed. - -## Torch FSDP2 Status - -Megatron Bridge also exposes a PyTorch FSDP2 path via `use_torch_fsdp2`, but -that path should still be treated as experimental on this branch. 
- -The stable recommendation today is: - -- use Megatron FSDP if you need an FSDP path in Bridge -- do not treat FSDP2 as interchangeable with Megatron FSDP - -## Related Docs - -- [docs/training/checkpointing.md](checkpointing.md) -- [docs/training/cpu-offloading.md](cpu-offloading.md) -- [docs/performance-guide.md](../performance-guide.md) -- [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md) -- [skills/perf-techniques/megatron-fsdp/card.yaml](../skills/perf-techniques/megatron-fsdp/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/attention-optimizations.md -```md -# Attention Optimizations - -Megatron Bridge provides several attention optimizations to improve the efficiency and performance of transformer models. These optimizations include Flash Attention for memory efficiency, and Multi-Query Attention (MQA) and Grouped-Query Attention (GQA) for computational efficiency. - -## Flash Attention - -### Overview - -Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences. - -Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency: - -1. **Tiling technique**: Decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, and value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step. - -2. **Recomputation technique**: Stores the softmax normalization factors (linear to sequence length), instead of the softmax results (quadratic to sequence length), and uses these normalization factors to recompute the attention scores. 
This reduces the amount of data written to global memory and the I/O traffic between global memory and shared memory. - -Flash attention lowers the memory footprint from quadratic to linear in sequence length and reduces memory I/O, greatly extending the range of sequence lengths allowed in large language models. - -### Configure Flash Attention - -In Megatron Bridge, flash attention is configured through the `attention_backend` parameter in your model configuration. The framework supports multiple attention backends through Transformer Engine integration: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.core.transformer.enums import AttnBackend - -# Configure model with flash attention (default) -model_config = GPTModelProvider( - attention_backend=AttnBackend.auto, # Let TE choose the best backend (default) - # ... other model parameters -) - -# Or explicitly specify flash attention -model_config = GPTModelProvider( - attention_backend=AttnBackend.flash_attn, # Explicitly use flash attention - # ... other model parameters -) -``` - -### Attention Backend Options - -Megatron Bridge supports several attention backends through the `attention_backend` configuration: - -- `AttnBackend.auto`: Automatically selects the best available backend (recommended) -- `AttnBackend.flash_attn`: Explicitly use Flash Attention implementation -- `AttnBackend.fused_attn`: Use cuDNN fused attention (when available) -- `AttnBackend.local`: Use local PyTorch implementation (for debugging) - -### Environment Variable Control - -For fine-grained control, you can still use environment variables to disable specific implementations: - -```bash -# Disable flash attention -export NVTE_FLASH_ATTN=0 - -# Disable cuDNN fused attention -export NVTE_FUSED_ATTN=0 -``` - -However, the recommended approach is to use the `attention_backend` configuration parameter. 
- -## Multi-query Attention (MQA) and Grouped-query Attention (GQA) - -**Multi-query Attention (MQA)** and **Grouped-query Attention (GQA)** are modifications of the traditional multihead attention mechanism in Transformer models. These methods improve the efficiency and effectiveness of attention mechanisms. - -### Overview - -**Multi-query Attention (MQA)** - -MQA treats all attention heads as a single group, reducing computational complexity and accelerating training times. It is beneficial when model scalability or limited computational resources are concerns. - -**Grouped-query Attention (GQA)** - -GQA groups the heads into clusters, each processing a subset of queries independently. This method balances the detailed focus of traditional multihead attention with the broad approach of MQA, enhancing nuanced input data processing. - -These attention variants offer: - -- **Reduced computational load**: Both methods decrease computation, beneficial for large models -- **Increased processing speed**: Simplifying attention leads to faster training and inference -- **Flexibility and adaptability**: Adjustments can be made based on task needs or hardware constraints - -### Enable MQA and GQA - -To use MQA or GQA in Megatron Bridge, adjust the `num_query_groups` parameter in your model configuration: - -#### Multi-query Attention (MQA) -Set `num_query_groups` to 1 to treat all attention heads as a single group: - -```python -from megatron.bridge.models import GPTModelProvider - -model_config = GPTModelProvider( - num_attention_heads=32, - num_query_groups=1, # Enables Multi-query Attention - # ... other model parameters -) -``` - -#### Grouped-query Attention (GQA) -Set `num_query_groups` to a number that is a divisor of the total number of attention heads (more than one but less than the total heads): - -```python -model_config = GPTModelProvider( - num_attention_heads=32, - num_query_groups=8, # Enables Grouped-query Attention (4 heads per group) - # ... 
other model parameters -) -``` - -#### Regular Multihead Attention -For regular attention, set this parameter to `None` or match it with the number of heads: - -```python -model_config = GPTModelProvider( - num_attention_heads=32, - num_query_groups=None, # Default setting for regular multihead attention - # Or equivalently: - # num_query_groups=32, # One group per head - # ... other model parameters -) -``` - -## Resources - -- [Megatron Core Attention Implementation](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py) -- [Flash Attention Paper](https://arxiv.org/abs/2205.14135) -- [Transformer Engine Attention Mechanisms](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/attention/attention.html) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md -```md -# Parallelisms Guide - -Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency. - -## Data Parallelism - -Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps. - -### Distributed Data Parallelism - -Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives. 
- -![Distributed Data Parallelism](images/ddp.gif) -*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.* - -### Distributed Optimizer - -[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs only its own gradient shard, the distributed optimizer performs a reduce-scatter of the parameter gradients instead of an all-reduce. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory requirements of large-scale LLM training. - -### Enable Data Parallelism - -In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group. - -To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig`: - -```python -from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig - -optimizer_config = OptimizerConfig( - optimizer="adam", - lr=3e-4, - weight_decay=0.1, - adam_beta1=0.9, - adam_beta2=0.95, - use_distributed_optimizer=True, - clip_grad=1.0, -) -ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) - -config = ConfigContainer( - ddp=ddp_config, - optimizer=optimizer_config, - # ... other config parameters -) -``` - -For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation. 
- -## Model Parallelism - -Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need for per-GPU memory. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance. - -### Tensor Parallelism - -Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads. - -![Tensor Parallelism Overview](images/tp1.png) -*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.* - -![Tensor Parallelism Implementation](images/tp2.png) -*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.* - -#### Enable Tensor Parallelism - -To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. This parameter determines the number of GPUs among which the model's tensors are partitioned. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with tensor parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Enable TP across 2 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Implement Tensor Parallelism - -Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html). 
- -### Pipeline Parallelism - -Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. - -![Pipeline Parallelism](images/pp.gif) -*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.* - -#### Enable Pipeline Parallelism - -To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with pipeline parallelism -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, # Distribute layers across 4 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Interleaved Pipeline Parallel Schedule - -To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`: - -```python -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=2, # 2 model chunks per pipeline stage - # ... other model parameters -) -``` - -For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism). - -#### Implement Pipeline Parallelism - -The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. 
For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html). - -### Expert Parallelism and Mixture of Experts (MoE) - -Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers. - -MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input. - -![Expert Parallelism](images/ep.png) -*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.* - -#### Basic MoE Configuration - -To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider: - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure basic MoE model -model_config = GPTModelProvider( - num_moe_experts=8, # Number of experts in the MoE module - moe_router_topk=2, # Number of experts activated per token - moe_ffn_hidden_size=8192, # Hidden size for expert FFN layers - # ... other model parameters -) -``` - -#### Enable Expert Parallelism - -To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. 
- -```python -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, # Distribute 8 experts across 4 GPUs (2 experts per GPU) - # ... other model parameters -) -``` - -#### Enable Expert Tensor Parallelism - -To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration: - -```python -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - expert_tensor_parallel_size=2, # Apply tensor parallelism within each expert - # ... other model parameters -) -``` - -#### Advanced MoE Features - -Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures. - -##### DeepEP and HybridEP Optimizations - -DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures: - -- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs -- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs - -These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads. - -**Enable DeepEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... other model parameters -) - -# Apply DeepEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") -``` - -**Enable HybridEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... 
other model parameters -) - -# Apply HybridEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep") -``` - -**GPU Architecture Requirements:** - -- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 -- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 - -The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware. - -##### Token Dropping for Load Balancing - -Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -model_config = GPTModelProvider( - num_moe_experts=8, - moe_router_topk=2, - moe_token_dispatcher_type="alltoall", # Required for token dropping - moe_router_load_balancing_type="aux_loss", # Required load balancing type - # ... other model parameters -) - -# Apply token dropping with capacity factor -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, # Capacity multiplier per expert - moe_pad_expert_input_to_capacity=True, # Pad inputs to capacity length -) -``` - -**Configuration Parameters:** - -- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing. -- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes. 
- -**Requirements:** - -- Token dispatcher must be `alltoall` or `alltoall_seq` -- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none` - -**Trade-offs:** - -Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed. - -#### Complete MoE Configuration Example - -Here's a complete example showing how to configure an MoE model with advanced optimizations: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - - # MoE configuration - num_moe_experts=8, # 8 experts total - moe_router_topk=2, # Activate 2 experts per token - moe_ffn_hidden_size=8192, # Expert FFN hidden dimension - moe_token_dispatcher_type="alltoall", # Token dispatcher type - moe_router_load_balancing_type="aux_loss", # Load balancing - - # Expert parallelism - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - expert_tensor_parallel_size=2, # Apply TP within each expert - - # ... other model parameters -) - -# Apply DeepEP optimization (for Ampere/Hopper GPUs) -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") - -# Apply token dropping for load balancing -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, - moe_pad_expert_input_to_capacity=True, -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Expert Parallelism Implementation - -The Megatron Bridge implementation of EP uses functionality from Megatron Core. 
Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details. - -## Activation Partitioning - -In LLM training, a large memory space is needed to store the input activations of the network layers. Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes. - -### Sequence Parallelism - -Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. - -![Sequence Parallelism](images/sp.png) -*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.* - -#### Enable Sequence Parallelism - -To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with sequence parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Required for sequence parallelism - sequence_parallel=True, # Enable sequence parallelism - # ... other model parameters -) -``` - -#### Implement Sequence Parallelism - -The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. 
For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py). - -### Context Parallelism - -Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers. - -CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences. - -#### Enable Context Parallelism - -To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with context parallelism -model_config = GPTModelProvider( - context_parallel_size=2, # Distribute sequence across 2 GPUs - # ... other model parameters -) -``` - -For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs. - -#### Implement Context Parallelism - -Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. 
In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. - -For more detailed technical information and implementation details, visit: -- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html) -- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py) -- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) - -## Combined Parallelism Example - -Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer, OptimizerConfig - -# Configure model with multiple parallelism strategies -model_config = GPTModelProvider( - # Model parallelism - tensor_model_parallel_size=2, # 2-way tensor parallelism - pipeline_model_parallel_size=4, # 4-way pipeline parallelism - virtual_pipeline_model_parallel_size=2, # Interleaved pipeline - - # Activation partitioning - sequence_parallel=True, # Enable sequence parallelism (requires TP > 1) - context_parallel_size=2, # 2-way context parallelism - - # Expert parallelism (for MoE models) - num_moe_experts=8, # 8 experts - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - - # ... other model parameters -) - -# Configure distributed optimizer -optimizer_config = OptimizerConfig( - optimizer="adam", - use_distributed_optimizer=True, # Enable distributed optimizer - # ... 
other optimizer parameters -) - -config = ConfigContainer( - model=model_config, - optimizer=optimizer_config, - # ... other config parameters -) -``` - -## Data Parallel Size Calculation - -The data parallel size is automatically calculated based on the total world size and model parallelism settings: - -``` -data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size) -``` - -For example, with 32 GPUs total and the configuration above: -- `tensor_model_parallel_size = 2` -- `pipeline_model_parallel_size = 4` -- `context_parallel_size = 2` -- `data_parallel_size = 32 / (2 × 4 × 2) = 2` - -## Strategy Selection Guide - -Choosing the right combination depends on model size, hardware topology, -and sequence length. - -### Dense Models by Size - -| Model size | GPUs | Recommended starting point | -|---|---|---| -| < 1B | 1-8 | DP only | -| 1-10B | 8-16 | TP=2-4 + DP | -| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP | -| 70-175B | 64-256 | TP=8 + PP=4-8 + DP | -| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP | - -### MoE Models - -MoE models differ fundamentally from dense models: only a fraction of -parameters are active per token, so TP can often stay at 1 or 2. EP is -the primary scaling dimension. - -| Total / active params | Typical layout | -|---|---| -| < 20B | EP only (TP=1, PP=1) | -| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 | -| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 | -| 500B+ | TP=2 + PP=16 + EP=32-64 | - -### By Hardware Topology - -- **Single node with NVLink**: maximize TP within the node (up to TP=8). -- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes. -- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling. 
- -### By Sequence Length - -| Sequence length | Recommendation | -|---|---| -| < 2K | standard TP + PP + DP | -| 2K-8K | add SP (`sequence_parallel=True`) | -| 8K-32K | add CP=2 | -| 32K+ | add CP=4-8, consider hierarchical CP | - -For operational details on configuring combined parallelism, troubleshooting -layouts, and memory estimation, see the -[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md). - -## Configuration Guidelines - -### Memory Optimization -- Use **distributed optimizer** to reduce optimizer state memory -- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory -- Use **context parallelism** for long sequence training -- Consider **pipeline parallelism** for very large models that don't fit on a single GPU - -### Performance Optimization -- **Tensor parallelism** works best within a single node (high bandwidth) -- **Pipeline parallelism** can work across nodes but requires careful batch size tuning -- **Context parallelism** is essential for long context scenarios -- **Expert parallelism** is specific to MoE models and should match the number of experts -- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures - -### Compatibility -- **Sequence parallelism** requires `tensor_model_parallel_size > 1` -- **Expert parallelism** requires MoE models (`num_moe_experts > 0`) -- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs -- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs -- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher -- All parallelism strategies can be combined, but total parallelism must divide evenly into the world size - -## Related Artifacts - -- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification -- **Knowledge card**: 
[skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status - -## Resources - -- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/) -- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/) -- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM) -- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/README.md -```md -# Nemotron 3 Examples - -This directory contains example scripts for Nemotron 3 language models: - -| Model | Parameters | Active Parameters | Subdirectory | -|-------|-----------|-------------------|--------------| -| Nemotron 3 Nano | 30B | A3B | [nano/](nano/) | -| Nemotron 3 Super | 120B | A12B | [super/](super/) | - -## Workspace Configuration - -All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it: - -```bash -export WORKSPACE=/your/custom/path -``` - -Directory structure: -- `${WORKSPACE}/models/` - Converted checkpoints -- `${WORKSPACE}/results/` - Training outputs and experiment results - -## Checkpoint Conversion - -Each model has its own conversion script: [nano/conversion.sh](nano/conversion.sh), [super/conversion.sh](super/conversion.sh). 
- -## Training Recipes - -Available recipes: - -**Nano** ([source](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py)): -- `nemotron_3_nano_pretrain_config`: Pretraining -- `nemotron_3_nano_sft_config`: Supervised fine-tuning -- `nemotron_3_nano_peft_config`: PEFT with LoRA support - -**Super** ([source](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py)): -- `nemotron_3_super_pretrain_config`: Pretraining -- `nemotron_3_super_sft_config`: Supervised fine-tuning -- `nemotron_3_super_peft_config`: PEFT with LoRA support - -Before training, ensure the following are configured: -1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path -2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories -3. **Environment Variables**: - - `HF_TOKEN`: to download models from HF Hub (if required) - - `HF_HOME`: (optional) to avoid re-downloading models and datasets - - `WANDB_API_KEY`: (optional) to enable WandB logging - -All training scripts use SLURM for containerized multi-node training. - -### Nano - -See the SLURM scripts in [nano/](nano/): [slurm_pretrain.sh](nano/slurm_pretrain.sh), [slurm_sft.sh](nano/slurm_sft.sh), [slurm_peft.sh](nano/slurm_peft.sh). - -### Super - -See the SLURM scripts in [super/](super/): [slurm_pretrain.sh](super/slurm_pretrain.sh), [slurm_sft.sh](super/slurm_sft.sh), [slurm_peft.sh](super/slurm_peft.sh). - -## Evaluation - -Coming soon. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -from megatron.bridge import AutoBridge -from megatron.bridge.peft.base import PEFT -from megatron.bridge.peft.lora import LoRA -from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common -from megatron.bridge.recipes.utils.finetune_utils import default_peft_config -from megatron.bridge.training.config import ConfigContainer - - -NEMOTRON_3_SUPER_HF_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16" - - -def nemotron_3_super_pretrain_config() -> ConfigContainer: - """Return a pre-training config for Nemotron 3 Super (120B-A12B LatentMoE). - - This is a Latent MoE model with Multi-Token Prediction (MTP). Default parallelism: - - TP=4, PP=1, EP=8, SP=True - - Returns: - ConfigContainer: Pre-training configuration for Nemotron 3 Super. 
- """ - cfg = _pretrain_common() - - # Model Configuration (LatentMoE with MTP) — derived from HF config via AutoBridge - cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False) - - # Parallelism Settings - cfg.model.tensor_model_parallel_size = 4 - cfg.model.pipeline_model_parallel_size = 1 - cfg.model.pipeline_dtype = torch.bfloat16 - cfg.model.virtual_pipeline_model_parallel_size = None - cfg.model.context_parallel_size = 1 - cfg.model.sequence_parallel = True - cfg.model.expert_tensor_parallel_size = 1 - cfg.model.expert_model_parallel_size = 8 - cfg.model.pipeline_model_parallel_layout = None - cfg.model.seq_length = 8192 - - # Tokenizer (--tokenizer-model) - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Dataset Configuration - cfg.dataset.seq_length = 8192 - cfg.dataset.blend = None - cfg.dataset.num_workers = 1 - cfg.dataset.mmap_bin_files = False - - # MoE Token Dispatcher Settings - cfg.model.moe_token_dispatcher_type = "alltoall" - cfg.model.moe_shared_expert_overlap = False - cfg.model.moe_flex_dispatcher_backend = "hybridep" - - # Training Configuration - cfg.train.train_iters = 39735 - cfg.train.global_batch_size = 3072 - cfg.train.micro_batch_size = 1 - cfg.train.manual_gc = False - cfg.train.manual_gc_interval = 0 - - # Validation - cfg.validation.eval_interval = 1000 - - # Transformer Engine (TE) - cfg.model.transformer_impl = "transformer_engine" - - # CUDA Graph (TE impl + partial scopes: ~40% throughput gain over disabled) - cfg.model.cuda_graph_impl = "transformer_engine" - cfg.model.cuda_graph_scope = ["attn", "mamba", "moe_router", "moe_preprocess"] - cfg.model.cuda_graph_warmup_steps = 3 - - # Kernel Selections - cfg.model.attention_backend = "fused" - cfg.model.cross_entropy_fusion_impl = "te" - cfg.model.use_te_rng_tracker = True - - # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block; - # mtp_num_layers=2 controls 
forward-pass repetitions with mtp_use_repeated_layer) - cfg.model.mtp_num_layers = 2 - cfg.model.keep_mtp_spec_in_bf16 = True - cfg.model.calculate_per_token_loss = True - cfg.model.mtp_loss_scaling_factor = 0.3 - cfg.model.mtp_use_repeated_layer = True - - # Mixed Precision - cfg.mixed_precision = "nemotron_3_super_bf16_with_nvfp4_mixed" - - # Optimizer hyperparameters - cfg.optimizer.lr = 4.5e-4 - cfg.optimizer.min_lr = 4.5e-6 - cfg.optimizer.weight_decay = 0.1 - cfg.optimizer.adam_beta1 = 0.9 - cfg.optimizer.adam_beta2 = 0.95 - cfg.optimizer.adam_eps = 1e-8 - cfg.scheduler.lr_warmup_iters = 333 - cfg.scheduler.start_weight_decay = 0.1 - cfg.scheduler.end_weight_decay = 0.1 - cfg.scheduler.lr_decay_style = "WSD" - - # Checkpoint Configuration - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_assume_constant_structure = True - cfg.checkpoint.dist_ckpt_strictness = "log_all" - cfg.checkpoint.async_save = True - - # DDP Configuration - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.use_distributed_optimizer = True - cfg.ddp.average_in_collective = False - - cfg.model.init_method_std = 0.014 - cfg.model.apply_rope_fusion = False - cfg.model.gradient_accumulation_fusion = True - cfg.model.use_fused_weighted_squared_relu = True - - return cfg - - -# ============================================================================= -# SFT Config -# ============================================================================= - - -def nemotron_3_super_sft_config() -> ConfigContainer: - """Return a full SFT config for Nemotron 3 Super (120B-A12B LatentMoE). - - Default parallelism: TP=1, PP=1, EP=8, SP=True - - Returns: - ConfigContainer with all settings pre-configured for Nemotron 3 Super SFT. 
- """ - cfg = _sft_common() - - # Model config — derived from HF config via AutoBridge - cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False) - - # Parallelism settings - cfg.model.tensor_model_parallel_size = 1 - cfg.model.pipeline_model_parallel_size = 1 - cfg.model.pipeline_dtype = torch.bfloat16 - cfg.model.virtual_pipeline_model_parallel_size = None - cfg.model.context_parallel_size = 1 - cfg.model.sequence_parallel = True - cfg.model.expert_tensor_parallel_size = 1 - cfg.model.expert_model_parallel_size = 8 - cfg.model.pipeline_model_parallel_layout = None - cfg.model.seq_length = 2048 - - # Training-specific model overrides - cfg.model.apply_rope_fusion = False - cfg.model.attention_backend = "fused" - cfg.model.gradient_accumulation_fusion = True - cfg.model.init_method_std = 0.014 - cfg.model.use_fused_weighted_squared_relu = True - cfg.model.calculate_per_token_loss = True - - # MoE Token Dispatcher Settings - cfg.model.moe_token_dispatcher_type = "alltoall" - cfg.model.moe_shared_expert_overlap = False - cfg.model.moe_flex_dispatcher_backend = "hybridep" - - # CUDA Graph disabled — packed-sequence SFT passes explicit attention masks that - # are incompatible with CUDA graph capture/replay in Mamba layers. 
- cfg.model.cuda_graph_impl = "none" - cfg.model.cuda_graph_scope = [] - - # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block; - # mtp_num_layers=2 controls forward-pass repetitions with mtp_use_repeated_layer) - cfg.model.mtp_num_layers = 2 - cfg.model.keep_mtp_spec_in_bf16 = True - cfg.model.mtp_loss_scaling_factor = 0.3 - cfg.model.mtp_use_repeated_layer = True - cfg.model.use_te_rng_tracker = True - - # Optimizer overrides - cfg.optimizer.lr = 5e-6 - cfg.optimizer.adam_beta1 = 0.9 - cfg.optimizer.adam_beta2 = 0.95 - cfg.optimizer.adam_eps = 1e-8 - cfg.optimizer.weight_decay = 0.1 - cfg.scheduler.start_weight_decay = 0.1 - cfg.scheduler.end_weight_decay = 0.1 - cfg.scheduler.lr_decay_style = "cosine" - - # Tokenizer - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Checkpoint config overrides - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.dist_ckpt_strictness = "log_all" - cfg.checkpoint.ckpt_assume_constant_structure = True - cfg.checkpoint.async_save = True - - # Logger config - cfg.logger.log_interval = 10 - - # RNG config - cfg.rng.seed = 1234 - - # DDP config - cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.grad_reduce_in_fp32 = True - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.use_distributed_optimizer = True - - return cfg - - -# ============================================================================= -# PEFT Config -# ============================================================================= - - -def nemotron_3_super_peft_config( - peft_scheme: str | PEFT = "lora", -) -> ConfigContainer: - """Return a PEFT config for Nemotron 3 Super (120B-A12B LatentMoE). - - Default parallelism: TP=1, PP=1, EP=1, SP=True - - Args: - peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. - - Returns: - ConfigContainer with all settings pre-configured for Nemotron 3 Super PEFT. 
- """ - cfg = _peft_common() - - # Model config — derived from HF config via AutoBridge - cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False) - - # Parallelism settings - cfg.model.tensor_model_parallel_size = 1 - cfg.model.pipeline_model_parallel_size = 1 - cfg.model.pipeline_dtype = torch.bfloat16 - cfg.model.virtual_pipeline_model_parallel_size = None - cfg.model.context_parallel_size = 1 - cfg.model.sequence_parallel = True - cfg.model.expert_tensor_parallel_size = 1 - cfg.model.expert_model_parallel_size = 1 - cfg.model.pipeline_model_parallel_layout = None - cfg.model.seq_length = 2048 - - # Training-specific model overrides - cfg.model.apply_rope_fusion = False - cfg.model.attention_backend = "fused" - cfg.model.gradient_accumulation_fusion = True - cfg.model.init_method_std = 0.014 - cfg.model.use_fused_weighted_squared_relu = True - cfg.model.calculate_per_token_loss = True - - # MoE Token Dispatcher Settings - cfg.model.moe_token_dispatcher_type = "alltoall" - cfg.model.moe_shared_expert_overlap = False - cfg.model.moe_flex_dispatcher_backend = "hybridep" - - # CUDA Graph disabled — packed-sequence SFT passes explicit attention masks that - # are incompatible with CUDA graph capture/replay in Mamba layers. 
- cfg.model.cuda_graph_impl = "none" - cfg.model.cuda_graph_scope = [] - - # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block; - # mtp_num_layers=2 controls forward-pass repetitions with mtp_use_repeated_layer) - cfg.model.mtp_num_layers = 2 - cfg.model.keep_mtp_spec_in_bf16 = True - cfg.model.mtp_loss_scaling_factor = 0.3 - cfg.model.mtp_use_repeated_layer = True - cfg.model.use_te_rng_tracker = True - - # PEFT config - Nemotron uses Mamba-specific target modules - mamba_target_modules = ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"] - if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: - cfg.peft = default_peft_config(peft_scheme, target_modules=mamba_target_modules) - elif isinstance(peft_scheme, PEFT): - cfg.peft = peft_scheme - else: - cfg.peft = LoRA( - target_modules=mamba_target_modules, - dim=32, - alpha=32, - dropout=0.0, - dropout_position="pre", - lora_A_init_method="xavier", - lora_B_init_method="zero", - ) - - # Optimizer overrides - cfg.optimizer.lr = 1e-4 - cfg.optimizer.adam_beta1 = 0.9 - cfg.optimizer.adam_beta2 = 0.95 - cfg.optimizer.adam_eps = 1e-8 - cfg.optimizer.weight_decay = 0.1 - cfg.scheduler.start_weight_decay = 0.1 - cfg.scheduler.end_weight_decay = 0.1 - cfg.scheduler.lr_decay_style = "cosine" - - # Tokenizer - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Checkpoint config overrides - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.dist_ckpt_strictness = "log_all" - cfg.checkpoint.ckpt_assume_constant_structure = True - cfg.checkpoint.async_save = True - - # Logger config - cfg.logger.log_interval = 10 - - # RNG config - cfg.rng.seed = 1234 - - # DDP config - cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.grad_reduce_in_fp32 = True - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.use_distributed_optimizer = True - - return cfg - - 
-__all__ = [ - "nemotron_3_super_pretrain_config", - "nemotron_3_super_sft_config", - "nemotron_3_super_peft_config", -] - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/cuda-graphs.md -```md -# CUDA Graphs - -CUDA graphs capture a sequence of GPU operations once and replay them with -minimal host overhead, reducing repeated kernel-launch and driver costs on -every training step. - -This page is the stable guide for what CUDA graphs are, when they help, and -what tradeoffs to expect. For exact enablement knobs, code anchors, and -verification commands, see `skills/perf-techniques/cuda-graphs/SKILL.md`. - -## What It Is - -CUDA graphs record a fixed sequence of GPU work during a capture phase and then -replay that sequence on later steps. The main benefit is lower host-side -launch overhead. - -Megatron Bridge supports two capture implementations: - -| `cuda_graph_impl` | Mechanism | Scope support | -|---|---|---| -| `"local"` | MCore `CudaGraphManager` / `FullCudaGraphWrapper` | `full_iteration` | -| `"transformer_engine"` | TE `make_graphed_callables()` per layer | `attn`, `mlp`, `moe`, `moe_router`, `moe_preprocess`, `mamba` | -| `"none"` (default) | Disabled | — | - -`"local"` captures the whole forward-backward iteration. `"transformer_engine"` -captures selected submodules and is usually the more flexible default path. - -## What Problem It Solves - -CUDA graphs mainly solve launch-bound training steps where GPU compute is fast -enough that repeated host-driver submission overhead becomes noticeable. - -This is most useful when: - -- tensor shapes are static across steps -- the workload has high step frequency or relatively small kernels -- the run has enough memory headroom to keep graph buffers resident - -It is less about changing the math and more about reducing runtime overhead. 
- -## Impacted Training Dimensions - -| Dimension | Effect | Confidence | Why | -|---|---|---|---| -| `speed` | ~15-30% faster step time | medium | Replays pre-captured GPU work and reduces launch overhead. Measured 16-24% on GPT-OSS-20B and 22% on Qwen3-30B-A3B with TE-scoped graphs. Gain depends on how launch-bound the workload is. | -| `memory` | ~0-2 GB extra (TE scoped); 10 GB+ possible with `PP > 1` or large MoE | high | Graph buffers stay allocated for replay. TE-scoped showed no measurable increase on 20B/30B models but OOM'd on 120B at 70/79 GB. | -| `scale` | neutral to slightly positive | low | Can help at scale if launch overhead matters, but memory overhead can gate larger configs (e.g., GPT-OSS-120B OOM). | -| `convergence` | no change expected | medium | Intended to preserve training math when capture constraints are satisfied. Loss matched within 0.001 on Qwen3-30B-A3B over 20 iterations. | -| `stability` | adds operational constraints | medium | Requires static shapes, specific RNG/NaN settings, and compatible scope selections. Failure modes are well-defined but add surface area. 
| - -## When to Use It - -Enable CUDA graphs when all of the following are mostly true: - -- sequence length and micro-batch size are static -- host overhead is a meaningful part of step time -- the run has spare memory budget -- you want throughput improvement without changing the training objective - -As a rule of thumb: - -- prefer `transformer_engine` scoped graphs for the safer first rollout -- use `local` `full_iteration` graphs only when you specifically want the - largest launch-overhead reduction and can accept the stricter constraints - -## When Not to Use It - -Avoid CUDA graphs when any of these are true: - -- sequence length or batch shapes vary step to step -- CPU offloading is enabled -- memory is already tight, especially with `PP > 1` -- you rely on runtime checks that conflict with `full_iteration` capture -- you need unsupported scope combinations for MoE or recompute paths -- SFT/LoRA with packed sequences (`packed_sequence=True`) — TE-scoped graphs - cannot capture `packed_seq_params` (non-Tensor input) -- full activation recompute (`recompute_granularity=full`) with TE-scoped - graphs — only `local` full-iteration graphs support full recompute - -## Feature Interactions - -The most important interactions are: - -- `use_te_rng_tracker` and `rng.te_rng_tracker`: required when CUDA graphs are enabled -- `rerun_state_machine.check_for_nan_in_loss`: must be disabled for `local` + `full_iteration` -- MoE routing scopes: `moe` and `moe_router` are mutually exclusive -- `moe_preprocess`: requires `moe_router` -- `delay_wgrad_compute`: adds extra constraints when captured scopes include attention or MoE router -- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`: requires `NCCL_GRAPH_REGISTER=0` in the relevant path -- CPU offloading: incompatible - -These interactions are stable enough to treat as design constraints, not just -debugging tips. 
- -## Bridge Configuration - -Minimal high-level configuration: - -```python -cfg.model.cuda_graph_impl = "transformer_engine" # or "local" -cfg.model.cuda_graph_scope = ["attn"] # or other valid scopes -cfg.model.cuda_graph_warmup_steps = 3 -cfg.model.use_te_rng_tracker = True -cfg.rng.te_rng_tracker = True -``` - -If you use `local` + `full_iteration`, also disable: - -```python -cfg.rerun_state_machine.check_for_nan_in_loss = False -cfg.ddp.check_for_nan_in_grad = False -``` - -## Minimal Runnable Example - -For a minimal Bridge-facing example, start from the functional smoke test: - -- `tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py` - -For a lightweight CLI-driven path, use the performance harness with scoped -capture and a small model recipe. - -## Expected Metric Changes - -| Metric | Expected Change | Conditions | Evidence | -|---|---|---|---| -| `step_time` | ~15-25% down | Static shapes, MoE, TE scoped (`attn+moe_router+moe_preprocess`) | measured: Qwen3-30B-A3B 623→484ms; GPT-OSS-20B 467-520→391-399ms | -| `tokens_per_sec` | ~20-33% up | Same as above | measured: Qwen3-30B-A3B 214→274 TFLOP/s/GPU; GPT-OSS-20B 37.9-42.2→49.4-50.4 | -| `peak_memory` | same pre-capture | TE scoped graphs on H100 80GB | measured: no increase in allocated memory on Qwen3-30B-A3B and GPT-OSS-20B | -| `OOM risk` | up | Tight memory budget or large MoE configs | measured: GPT-OSS-120B blocked at ~70/79 GB before capture | - -Do not assume a fixed throughput gain across models. The improvement depends on -how launch-bound the workload is and how much scope is captured. 
- -## Measured Results (Qwen3-30B-A3B MoE, H100, TP2 PP2 EP4, 2 nodes) - -### Pretrain - -TE-scoped CUDA graphs (`attn + moe_router + moe_preprocess`) on Qwen3-30B-A3B -with mock data, GBS=8, MBS=1: - -- **~22% faster** iteration time (484ms vs 623ms steady-state) -- **~28% higher TFLOP/s** (274 vs 214 TFLOP/s/GPU) -- **Loss matches** baseline within 0.001 across all 20 iterations -- 24 graphable layers per pipeline rank, capture completes in ~5.6s -- No memory increase pre-capture, no NCCL errors - -### SFT (packed sequences) - -SFT with packed sequences (`packed_sequence=True`, SQuAD dataset) hits a -hard incompatibility: - -``` -AssertionError: CUDA graph accepts only Tensor inputs. -inference_context and packed_seq_params are excluded from input list. -``` - -TE-scoped CUDA graphs require all forward inputs to be Tensors. Packed -sequence SFT passes `packed_seq_params` (a dataclass), which is not captured. -The baseline SFT runs fine without graphs (~880ms/iter). - -Workarounds: disable packing, or use `local` full-iteration graphs. Also make -sure the TE/container build actually supports the packed-sequence attention -backend your recipe needs. 
- -## Additional Validation (GPT-OSS, H100, Mar 2026) - -### GPT-OSS-20B pretrain - -TE-scoped CUDA graphs on `gpt-oss-20b` with `TP2 PP4 EP4 CP1`, 2 nodes, and -mock data: - -- capture succeeds with 6 graphable layers per pipeline rank; capture completes - in ~0.95s -- steady-state iteration time improves by ~16-24% (467-520ms to 391-399ms) -- throughput improves by ~19-33% (37.9-42.2 to 49.4-50.4 TFLOP/s/GPU) -- the pre-capture memory report is unchanged and the 20-iteration run completes - without NCCL or illegal-memory-access errors -- loss comparison is inconclusive: the first ~10 post-capture iterations are - close, but the run used mock data, `GBS=4`, and a production LR, so later - divergence is too noisy to treat as a correctness signal - -A cleaner loss-match pass should lower LR and/or raise GBS before drawing -equivalence conclusions. - -### GPT-OSS-20B SFT and LoRA - -Both packed-sequence finetuning workloads were blocked in the -`mbridge-260128.sqsh` container before any CUDA-graph-specific behavior could -be isolated: - -- baseline and graphed runs both fail with no TE attention backend available - for the packed-sequence path -- treat this as an environment/container blocker first, not as proof that CUDA - graphs are or are not the root cause -- after upgrading TE/container support, these workloads still need separate - validation because packed-sequence plus TE-scoped graphs remains a sensitive - combination - -### GPT-OSS-120B pretrain - -`gpt-oss-120b` pretrain at `TP2 PP4 EP8`, 4 nodes, hits OOM on iteration 2: - -- iteration 1 already uses ~69-70 GB allocated and ~72-73 GB reserved on 79 GB - H100s -- the failure is a `torch.OutOfMemoryError` on an additional 1.54 GiB - allocation -- treat larger MoE rollouts as memory-gated even before capture benefits are - realized; more PP or different memory settings may be needed - -## Common Failure Modes - -- Missing TE RNG tracker settings causes an assertion before training starts. 
-- Dynamic sequence or batch shapes break capture or replay assumptions. -- `local` `full_iteration` graphs fail when NaN-loss checking is still enabled. -- Illegal scope combinations such as `moe` with `moe_router` fail validation. -- Runs that fit in eager mode can OOM after enabling graphs because buffers stay pinned. -- Full activation recompute (`recompute_granularity=full`) with TE-scoped graphs - asserts: `full recompute is only supported with full iteration CUDA graph`. - Disable recompute or switch to `local` implementation. -- Packed-sequence SFT/LoRA asserts: `CUDA graph accepts only Tensor inputs. - inference_context and packed_seq_params are excluded from input list.` - TE-scoped graphs cannot capture non-Tensor forward arguments. -- Older TE/container builds can fail packed-sequence attention before graph - capture begins (`Available backends = {FlashAttention=False, - FusedAttention=False, UnfusedDotProductAttention=False}`). In that case the - baseline and graph runs are both blocked, so fix the environment first. - -## Related Docs - -- [Performance Guide](../performance-guide.md) -- [Communication Overlap](communication-overlap.md) -- `skills/perf-techniques/cuda-graphs/SKILL.md` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/checkpointing.md -```md -# Checkpointing - -The {py:class}`bridge.training.config.CheckpointConfig` controls model checkpointing behavior, including saving and loading checkpoints, checkpoint formats, and various optimization features. - -```{Note} -This documentation covers **Megatron-format checkpoints** used during training. For converting between 🤗 Hugging Face and Megatron formats, see the {doc}`../bridge-guide`. -``` - -## Overview - -Megatron Bridge uses Megatron Core's distributed checkpointing system, which is designed for large-scale training across multiple GPUs and nodes. 
The distributed checkpoint approach saves the state of a distributed training job by sharding checkpoint data across multiple files, reducing memory overhead and improving GPU utilization during save/load operations. - -### Distributed Checkpointing Benefits - -**Memory Efficiency**: Instead of gathering all model parameters and optimizer states on a single rank, distributed checkpointing saves data directly from each rank, significantly reducing memory requirements during checkpointing. - -**Parallelism Flexibility**: The system provides flexibility to resume training using different parallelism strategies. You can change tensor parallelism, pipeline parallelism, or data parallelism sizes between checkpoint save and load operations. - -**Scalability**: Handles all types of parallelism including: -- **Data Parallelism (DP)**: Replicates the model across multiple GPUs with different data batches -- **Tensor Parallelism (TP)**: Distributes individual layer parameters across GPUs -- **Pipeline Parallelism (PP)**: Assigns consecutive layers to different GPUs -- **Context Parallelism (CP)**: Shards tensors along the sequence dimension for long sequences -- **Expert Parallelism (EP)**: Distributes MoE expert weights across GPUs - -**Performance**: The distributed optimizer shards optimizer states and master parameters across data-parallel ranks instead of replicating them, reducing memory usage and communication overhead. 
- - -## Save Configuration - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `save` | `Optional[str]` | `None` | Output directory to save checkpoints to **in Megatron format** | -| `save_interval` | `Optional[int]` | `None` | Number of iterations between persistent checkpoint saves | -| `save_optim` | `bool` | `True` | Whether to save optimizer state | -| `save_rng` | `bool` | `True` | Whether to save random number generator state | -| `save_tokenizer_assets` | `bool` | `True` | Whether to save tokenizer files (vocab, config, special tokens) to checkpoint | - -### Asynchronous Saving - -Asynchronous saving allows training to continue while checkpoint data is persisted to disk in the background, reducing the impact of checkpointing on training throughput. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `async_save` | `bool` | `False` | Enable asynchronous checkpoint saving (requires `torch_dist` format) | - -## Load Configuration - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `load` | `Optional[str]` | `None` | Directory containing a model checkpoint to load **in Megatron format** | -| `load_optim` | `bool` | `True` | Whether to load optimizer state from checkpoint | -| `load_rng` | `bool` | `True` | Whether to load random number generator state from checkpoint | -| `load_main_params_from_ckpt` | `bool` | `False` | Load main parameters from checkpoint (use with `load_optim=False`) | -| `ckpt_step` | `Optional[int]` | `None` | Specific checkpoint iteration to load (overrides latest from tracker) | -| `exit_on_missing_checkpoint` | `bool` | `False` | Exit if specified checkpoint is not found instead of random initialization | -| `dist_ckpt_strictness` | `Literal[...]` | `"assume_ok_unexpected"` | Handling of key mismatches during distributed checkpoint load | - -### Loading Specific Checkpoint Iterations - -By default, 
Megatron Bridge loads the **latest checkpoint** available in the specified directory by reading from the tracker file (`latest_train_state.pt`). However, you can explicitly load from a specific checkpoint iteration using the `ckpt_step` parameter. - -**Python API:** -```python -from megatron.bridge.training.config import CheckpointConfig - -# Load latest checkpoint -checkpoint = CheckpointConfig( - load="/path/to/checkpoint_dir" -) - -# Load specific iteration -checkpoint = CheckpointConfig( - load="/path/to/checkpoint_dir", - ckpt_step=5000 # Overrides tracker, loads iter_0005000 -) -``` - -```{note} -The `load` parameter should always point to the base checkpoint directory (not the `iter_N` subdirectory). The `ckpt_step` parameter overrides which iteration is loaded from that directory. - -**Important:** If `ckpt_step` is specified but the checkpoint directory does not exist, training will **fail immediately** with a `FileNotFoundError`. This is intentional to prevent accidentally starting training from scratch when you meant to resume from a specific checkpoint. - -**PEFT Note:** The `ckpt_step` parameter applies **only to the `load` path** (adapter checkpoints), not to `pretrained_checkpoint` (frozen base model). When resuming PEFT training: -- `pretrained_checkpoint`: Always loads the latest/release checkpoint (base model) -- `load` + `ckpt_step`: Can load a specific adapter checkpoint iteration -``` - -### Checkpoint Loading Strictness - -When loading distributed checkpoints, there may be mismatches between the keys in the saved checkpoint and what the current model expects. This can happen when resuming training with different parallelism settings, model configurations, or software versions. 
The `dist_ckpt_strictness` parameter controls how these mismatches are handled: - -- **`assume_ok_unexpected`**: Assume unexpected keys are acceptable (default, most permissive) -- **`log_unexpected`**: Log unexpected keys but continue loading -- **`log_all`**: Log all key mismatches for debugging -- **`raise_unexpected`**: Raise error on unexpected keys (stricter validation) -- **`raise_all`**: Raise error on any key mismatch (strictest validation) -- **`return_unexpected`**: Return information about unexpected keys -- **`return_all`**: Return information about all key mismatches -- **`ignore_all`**: Ignore all key mismatches completely - -## Fine-tuning Configuration - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `pretrained_checkpoint` | `Optional[str]` | `None` | Directory containing pretrained model checkpoint **in Megatron format** for fine-tuning | - -## Checkpoint Format - -Megatron Bridge supports multiple checkpoint formats optimized for different use cases: - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `ckpt_format` | `Literal["torch_dist", "zarr", "fsdp_dtensor"]` | `"torch_dist"` | Checkpoint format to use | - -### Available Formats - -**`torch_dist`** (Default) -- PyTorch distributed checkpoint format -- Compatible with most parallelism strategies (DP, TP, PP, CP, EP) -- Supports asynchronous saving when `async_save=True` -- Recommended for general use - -**`zarr`** -- Zarr-based checkpoint format -- Alternative to `torch_dist` for certain use cases -- Compatible with distributed parallelism strategies - -**`fsdp_dtensor`** -- Specialized format for Megatron FSDP (Fully Sharded Data Parallel) -- **Required when using `use_megatron_fsdp=True`** -- Optimized for sharded parameter layouts -- Not compatible with other FSDP implementations - -### Format Selection - -Choose your checkpoint format based on your training configuration: - -```python -from 
megatron.bridge.training.config import CheckpointConfig - -# Standard distributed training (DDP, TP, PP) -checkpoint = CheckpointConfig( - ckpt_format="torch_dist", # Default, works for most cases - save="/path/to/checkpoints", -) - -# Megatron FSDP training -checkpoint = CheckpointConfig( - ckpt_format="fsdp_dtensor", # Required for FSDP - save="/path/to/checkpoints", -) -``` - -### Format Compatibility - -| Format | DDP | Distributed Optimizer | Megatron FSDP | Torch FSDP2 | Async Save | -|--------|-----|----------------------|---------------|-------------|------------| -| `torch_dist` | ✅ | ✅ | ❌ | ✅ | ✅ | -| `zarr` | ✅ | ✅ | ❌ | ✅ | ❌ | -| `fsdp_dtensor` | ❌ | ❌ | ✅ | ❌ | ❌ | - -**Important**: When using Megatron FSDP (`use_megatron_fsdp=True`), you must set `ckpt_format="fsdp_dtensor"`. Other formats are not compatible with FSDP's sharded parameter layout. See {doc}`megatron-fsdp` for complete FSDP configuration details. - -## Performance Optimizations - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `fully_parallel_save` | `bool` | `True` | Apply full save parallelization across data parallel ranks | -| `fully_parallel_load` | `bool` | `False` | Apply full load parallelization across data parallel ranks | -| `ckpt_assume_constant_structure` | `bool` | `False` | Assume constant model/optimizer structure over successive checkpoint saves for performance optimizations | - - -## Checkpoint Contents - -The checkpoint includes the following components when using the `torch_dist` checkpoint format: -- **Model parameters and optimizer states**: Stored across `.distcp` files to support distributed training. -- **Training state**: Captures the current iteration count, number of consumed samples, and the state of the learning rate scheduler. -- **Configuration**: Serialized as a YAML file (`run_config.yaml`) containing the complete `ConfigContainer`. 
-- **Tokenizer files**: All tokenizer artifacts (vocabulary, special tokens, config) for self-contained checkpoints. -- **Dataloader states**: Ensures deterministic resumption of data iteration. -- **Metadata**: Used for validating and correctly loading the checkpoint. - -Megatron Bridge creates checkpoints with the following directory structure: - -``` -checkpoint_dir/ -├── latest_train_state.pt # Latest training state (top-level) -├── iter_N/ # Checkpoint at iteration N -│ ├── __0_0.distcp # Distributed checkpoint shards: maps to PyTorch DCP weights format -│ ├── __0_1.distcp # Contains model parameters, optimizer states -│ ├── __1_0.distcp -│ ├── __1_1.distcp -│ ├── ... -│ ├── .metadata # PyTorch DCP checkpoint metadata -│ ├── common.pt # MCore dist ckpt states saved from rank 0 -│ ├── metadata.json # MCore dist ckpt metadata -│ ├── run_config.yaml # Serialized ConfigContainer -│ ├── train_state.pt # Number of steps, consumed samples, etc -│ ├── tokenizer/ # Tokenizer files (saved by default) -│ │ ├── tokenizer.json # Full tokenizer vocabulary -│ │ ├── tokenizer_config.json # Tokenizer configuration -│ │ ├── special_tokens_map.json # Special token definitions -│ │ └── ... # Other tokenizer artifacts -│ ├── dataloader_state/ # Data iterator states -│ │ ├── train_dataloader_dprank000.pt # DP rank 0 dataloader state -│ │ ├── train_dataloader_dprank001.pt # DP rank 1 dataloader state -│ │ ├── train_dataloader_dprank002.pt # DP rank 2 dataloader state -│ │ └── ... # One file per DP rank -``` - -### Tokenizer Assets - -By default, Megatron Bridge saves all tokenizer files to the checkpoint directory, making checkpoints self-contained and portable. 
This is particularly important for: -- **Inference and evaluation**: Direct access to tokenizer for computing logprobs -- **Portability**: No dependency on original tokenizer file locations -- **Reproducibility**: Exact tokenizer state is preserved - -The tokenizer files saved depend on the tokenizer type: -- **HuggingFace tokenizers**: `tokenizer.json`, `tokenizer_config.json`, `special_tokens_map.json`, and vocab files -- **SentencePiece tokenizers**: `tokenizer.model` file -- **GPT2 BPE tokenizers**: `vocab.json` and `merges.txt` -- **BERT tokenizers**: `vocab.txt` -- **Tiktoken tokenizers**: `tokenizer.json` - -To disable tokenizer asset saving for performance-sensitive scenarios: - -```python -from megatron.bridge.training.config import CheckpointConfig - -checkpoint = CheckpointConfig( - save_tokenizer_assets=False, # Skip tokenizer file saving - ... -) -``` - -Or in YAML: - -```yaml -checkpoint: - save_tokenizer_assets: false -``` - -## Local Checkpointing - -Local checkpointing saves model checkpoints directly to storage on each node (e.g., local SSDs or RAM disks), instead of relying solely on a shared network filesystem. This approach can significantly speed up the saving process and reduce the load on shared storage infrastructure. - -Local checkpointing leverages the [NVIDIA Resiliency Extension](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/local/index.html) and provides several key features: - -- **Local Saving**: Each node saves its part of the checkpoint locally, reducing network I/O and improving save performance. -- **Synchronous and Asynchronous Support**: Saving can happen synchronously or asynchronously, mirroring the configuration used for global checkpoints. -- **Automatic Cleanup**: Handles the removal of outdated or incomplete local checkpoints automatically. -- **Optional Replication**: For multi-node jobs, checkpoints are replicated to other nodes to allow recovery even if a node fails after saving. 
Single-node jobs do not use replication. -- **Automated Loading**: When resuming, the framework automatically finds the latest valid checkpoint, comparing local and global checkpoints, and retrieves any needed parts across nodes. -### Non-Persistent Checkpointing Configuration - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `non_persistent_save_interval` | `Optional[int]` | `None` | Iterations between non-persistent saves | -| `non_persistent_ckpt_type` | `Optional[Literal["global", "local", "in_memory", "None"]]` | `None` | Type of non-persistent checkpointing | -| `non_persistent_global_ckpt_dir` | `Optional[str]` | `None` | Directory for global non-persistent checkpoints | -| `non_persistent_local_ckpt_dir` | `Optional[str]` | `None` | Directory for local non-persistent checkpoints | -| `non_persistent_local_ckpt_algo` | `Literal["fully_parallel", "atomic"]` | `"fully_parallel"` | Algorithm for local non-persistent checkpointing | - -### Replication and Fault Tolerance - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `replication` | `bool` | `False` | Enable replication of local checkpoints across ranks | -| `replication_jump` | `Optional[int]` | `None` | Spacing between ranks storing replicas | -| `replication_factor` | `int` | `2` | Number of machines storing replica of each rank's data | - -### Checkpointing Distributed Optimizer - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `dist_ckpt_optim_fully_reshardable` | `bool` | `False` | Make optimizer distributed checkpoint fully reshardable (TP/PP/EP/DP) as opposed to plain DP reshardability | -| `distrib_optim_fully_reshardable_mem_efficient` | `bool` | `False` | Use as little memory as possible during save and load by using Gloo. 
Has an effect only with the `dist_ckpt_optim_fully_reshardable` flag | - -## Custom Checkpoint Manager - -For advanced use cases, you can provide a custom checkpoint manager implementation to override the default save/load behavior. This enables integration with custom storage backends, alternative checkpoint formats, or organization-specific checkpointing workflows. - -### Configuration - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `custom_manager_class` | `str \| None` | `None` | Fully qualified class name for a custom `CheckpointManager` implementation | - -### Usage - -Specify a custom checkpoint manager class in your configuration: - -**YAML:** -```yaml -checkpoint: - save: /path/to/checkpoints - custom_manager_class: "mypackage.checkpoint.MyCheckpointManager" -``` - -**Python:** -```python -from megatron.bridge.training.config import CheckpointConfig - -checkpoint = CheckpointConfig( - save="/path/to/checkpoints", - custom_manager_class="mypackage.checkpoint.MyCheckpointManager", -) -``` - -### Implementing a Custom Manager - -Your custom manager must implement the `CheckpointManager` protocol defined in `megatron.bridge.training.checkpointing`: - -```python -from megatron.bridge.training.checkpointing import ( - CheckpointManager, - CheckpointSaveContext, - CheckpointLoadContext, - save_checkpoint, - load_checkpoint, - init_checkpointing_context, -) -from megatron.bridge.training.config import CheckpointConfig -from megatron.bridge.training.state import GlobalState - - -class MyCheckpointManager: - """Custom checkpoint manager example.""" - - def __init__(self, checkpoint_config: CheckpointConfig) -> None: - self.checkpoint_config = checkpoint_config - # Initialize internal context for caching strategies - self._context = init_checkpointing_context(checkpoint_config) - - def save(self, ctx: CheckpointSaveContext) -> None: - """Save a checkpoint with custom logic.""" - # Option 1: Completely custom implementation - 
# my_custom_save(ctx.state, ctx.model, ...) - - # Option 2: Wrap the default implementation - save_checkpoint( - state=ctx.state, - model=ctx.model, - optimizer=ctx.optimizer, - opt_param_scheduler=ctx.opt_param_scheduler, - num_floating_point_operations_so_far=ctx.num_floating_point_operations_so_far, - checkpointing_context=self._context, - non_persistent_ckpt=ctx.non_persistent_ckpt, - train_data_iterator=ctx.train_data_iterator, - ) - # Add custom post-processing (e.g., upload to cloud) - upload_to_s3(ctx.state.cfg.checkpoint.save) - - def load(self, ctx: CheckpointLoadContext) -> tuple[int, int]: - """Load a checkpoint with custom logic.""" - # Returns (iteration, num_floating_point_operations_so_far) - return load_checkpoint( - state=ctx.state, - model=ctx.model, - optimizer=ctx.optimizer, - opt_param_scheduler=ctx.opt_param_scheduler, - strict=ctx.strict, - checkpointing_context=self._context, - skip_load_to_model_and_opt=ctx.skip_load_to_model_and_opt, - ) - - def finalize_async_saves( - self, state: GlobalState, blocking: bool = False, terminate: bool = False - ) -> None: - """Finalize any pending asynchronous saves.""" - from megatron.bridge.training.checkpointing import maybe_finalize_async_save - - maybe_finalize_async_save( - global_state=state, - ckpt_cfg=self.checkpoint_config, - blocking=blocking, - terminate=terminate, - ) -``` - -### Context Dataclasses - -The save and load methods receive context dataclasses that bundle all required parameters: - -**`CheckpointSaveContext`:** -| Field | Type | Description | -|-------|------|-------------| -| `state` | `GlobalState` | Global training state (config, train_state, loggers) | -| `model` | `list[MegatronModule]` | Model modules to save | -| `optimizer` | `MegatronOptimizer \| None` | Optimizer instance | -| `opt_param_scheduler` | `Any \| None` | Learning rate scheduler | -| `num_floating_point_operations_so_far` | `int` | Cumulative FLOPs | -| `train_data_iterator` | `Any \| None` | Data iterator 
(optional) | -| `non_persistent_ckpt` | `bool` | Whether this is a non-persistent checkpoint | - -**`CheckpointLoadContext`:** -| Field | Type | Description | -|-------|------|-------------| -| `state` | `GlobalState` | Global training state | -| `model` | `list[MegatronModule]` | Model modules to load into | -| `optimizer` | `MegatronOptimizer \| None` | Optimizer instance | -| `opt_param_scheduler` | `Any \| None` | Learning rate scheduler | -| `strict` | `bool` | Enforce strict loading (default: `True`) | -| `skip_load_to_model_and_opt` | `bool` | Skip loading into model/optimizer (default: `False`) | - -### Limitations - -The custom checkpoint manager is designed for customizing the save/load **operations** during training. The following limitations apply: - -**Checkpoint format compatibility**: Custom managers that change the checkpoint directory structure or metadata files (e.g., `latest_train_state.pt`, `run_config.yaml`) are not well supported. Many utilities in Megatron Bridge assume the standard Megatron checkpoint format. For instance, HuggingFace ↔ custom format conversion is not supported. - -**PEFT with custom checkpoints**: When using PEFT (Parameter-Efficient Fine-Tuning), the `pretrained_checkpoint` path must point to a Megatron-format checkpoint. The custom manager only applies to the training save/load flow (the `save` and `load` configuration paths), not to base model loading for PEFT. - -**Inference loading**: Loading checkpoints for inference via `model_load_save.py` utilities is undefined behavior with custom checkpoint formats. Use your custom format's loading utilities instead. - -### Default Behavior - -When `custom_manager_class` is not set, Megatron Bridge uses `DefaultCheckpointManager`, which wraps the existing `save_checkpoint` and `load_checkpoint` functions. This ensures full backward compatibility—the checkpoint manager abstraction introduces no changes to existing training workflows. 
- -## Related Documentation - -- {doc}`megatron-fsdp` - Megatron FSDP configuration and `fsdp_dtensor` format requirements -- {doc}`../parallelisms` - Understanding data and model parallelism strategies -- {doc}`config-container-overview` - Complete configuration reference - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/gpt_step.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from functools import partial -from typing import Iterable - -import modelopt.torch.distill as mtd -import torch -from megatron.core import parallel_state -from megatron.core.models.gpt import GPTModel -from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage -from megatron.core.utils import ( - get_batch_on_this_cp_rank, - get_model_config, - is_te_min_version, - unwrap_model, -) - -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.losses import masked_next_token_loss -from megatron.bridge.training.post_training.distillation import loss_func_kd -from megatron.bridge.training.state import GlobalState -from megatron.bridge.training.utils.packed_seq_utils import get_packed_seq_params -from megatron.bridge.training.utils.pg_utils import get_pg_collection - - -logger = logging.getLogger(__name__) - - -def _partition_packed_batch_for_cp(batch: dict[str, torch.Tensor], cp_size: int) -> dict[str, torch.Tensor]: 
- """Partition THD/packed batches across context-parallel ranks. - - Uses transformer_engine's `thd_get_partitioned_indices` to slice sequence - dimension aligned with packed cu_seqlens. This avoids the generic - `get_batch_on_this_cp_rank` slicing which assumes contiguous sequence tokens. - """ - - err_msg = "Please update Transformer Engine to >= 1.10 to use Context Parallel with THD format data" - try: - import transformer_engine_torch as tex - - if not is_te_min_version("1.10.0"): - logger.error(err_msg) - raise RuntimeError(err_msg) - except ModuleNotFoundError as e: - logger.error(err_msg) - raise e - - cp_rank = parallel_state.get_context_parallel_rank() - cu_seqlens = batch["cu_seqlens"] - if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1: - raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)") - cu_seqlens = cu_seqlens.squeeze() - cu_seqlens_unpadded = batch.get("cu_seqlens_unpadded") - if cu_seqlens_unpadded is not None: - batch["cu_seqlens_unpadded"] = cu_seqlens_unpadded.squeeze() - - skip_keys = { - "cu_seqlens", - "cu_seqlens_unpadded", - "cu_seqlens_argmin", - "cu_seqlens_unpadded_argmin", - "max_seqlen", - "token_count", - } - - for key, val in batch.items(): - if val is None or key in skip_keys: - continue - index = tex.thd_get_partitioned_indices(cu_seqlens, val.size(1), cp_size, cp_rank) - batch[key] = val.index_select(1, index) - - return batch - - -def get_batch_from_iterator( - data_iterator: Iterable, - use_mtp: bool = False, - skip_getting_attention_mask_from_dataset: bool = True, - *, - is_first_pp_stage: bool, - is_last_pp_stage: bool, -) -> dict[str, torch.Tensor]: - """Get a batch of data from the iterator. - - Args: - data_iterator: The data iterator to get the batch from. - use_mtp: Whether Multi-Token Prediction layers are enabled. - skip_getting_attention_mask_from_dataset: If set, the dataset will pass a None attention mask. 
- - Returns: - dict[str, torch.Tensor]: A dictionary containing the batch data. - """ - batch = next(data_iterator) - - required_device_keys = set() - required_host_keys = set() - - if not skip_getting_attention_mask_from_dataset: - required_device_keys.add("attention_mask") - - if "cu_seqlens" in batch: - required_device_keys.add("cu_seqlens") - if "cu_seqlens_unpadded" in batch: - required_device_keys.add("cu_seqlens_unpadded") - required_host_keys.add("cu_seqlens_argmin") - required_host_keys.add("max_seqlen") - if "cu_seqlens_unpadded_argmin" in batch: - required_host_keys.add("cu_seqlens_unpadded_argmin") - - if is_first_pp_stage or use_mtp: - required_device_keys.update(("tokens", "position_ids")) - if is_last_pp_stage: - required_device_keys.update(("labels", "loss_mask")) - - _batch_required_keys = {} - for key, val in batch.items(): - if key in required_device_keys: - _batch_required_keys[key] = val.cuda(non_blocking=True) if val is not None else None - elif key in required_host_keys: - _batch_required_keys[key] = val.cpu() if val is not None else None - else: - _batch_required_keys[key] = None - - return _batch_required_keys - - -def get_batch( - data_iterator: Iterable, cfg: ConfigContainer, use_mtp: bool = False, *, pg_collection -) -> tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor | None, - torch.Tensor | None, -]: - """Generate a batch. 
- - Args: - data_iterator: Input data iterator - cfg: Configuration container - use_mtp: Whether Multi-Token Prediction layers are enabled - - Returns: - tuple of tensors containing tokens, labels, loss_mask, attention_mask, position_ids, - cu_seqlens, cu_seqlens_argmin, max_seqlen, cu_seqlens_unpadded, and - cu_seqlens_unpadded_argmin - """ - # Determine pipeline stage role via process group collection - is_first = is_pp_first_stage(pg_collection.pp) - is_last = is_pp_last_stage(pg_collection.pp) - if (not is_first) and (not is_last): - return None, None, None, None, None, None, None, None, None, None - - batch = get_batch_from_iterator( - data_iterator, - use_mtp, - getattr(cfg.dataset, "skip_getting_attention_mask_from_dataset", True), - is_first_pp_stage=is_first, - is_last_pp_stage=is_last, - ) - - cp_size = pg_collection.cp.size() - has_packed = batch.get("cu_seqlens") is not None - if has_packed and cp_size > 1: - batch = _partition_packed_batch_for_cp(batch, cp_size) - else: - # slice batch along sequence dimension for context parallelism - batch = get_batch_on_this_cp_rank(batch, cp_group=pg_collection.cp) - - return ( - batch["tokens"], - batch["labels"], - batch["loss_mask"], - batch.get( - "attention_mask" - ), # Attention_mask is optional for pre-training as a casual mask is generated automatically. - batch["position_ids"], - batch.get("cu_seqlens"), - batch.get("cu_seqlens_argmin"), - batch.get("max_seqlen"), - batch.get("cu_seqlens_unpadded"), - batch.get("cu_seqlens_unpadded_argmin"), - ) - - -def _forward_step_common( - state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False -) -> tuple[torch.Tensor, torch.Tensor]: - """Forward training step. 
- - Args: - state: Global state for the run - data_iterator: Input data iterator - model: The GPT Model - return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor - - Returns: - tuple containing the output tensor and loss mask - """ - timers = state.timers - straggler_timer = state.straggler_timer - - config = get_model_config(model) - pg_collection = get_pg_collection(model) - use_mtp = (getattr(config, "mtp_num_layers", None) or 0) > 0 - - timers("batch-generator", log_level=2).start() - with straggler_timer(bdata=True): - ( - tokens, - labels, - loss_mask, - attention_mask, - position_ids, - cu_seqlens, - cu_seqlens_argmin, - max_seqlen, - cu_seqlens_unpadded, - cu_seqlens_unpadded_argmin, - ) = get_batch(data_iterator, state.cfg, use_mtp, pg_collection=pg_collection) - timers("batch-generator").stop() - - forward_args = { - "input_ids": tokens, - "position_ids": position_ids, - "attention_mask": attention_mask, - "labels": labels, - } - - # Add packed sequence support - if cu_seqlens is not None: - packed_seq_params = { - "cu_seqlens": cu_seqlens, - "cu_seqlens_argmin": cu_seqlens_argmin, - "max_seqlen": max_seqlen, - "cu_seqlens_unpadded": cu_seqlens_unpadded, - "cu_seqlens_unpadded_argmin": cu_seqlens_unpadded_argmin, - } - forward_args["packed_seq_params"] = get_packed_seq_params(packed_seq_params) - - with straggler_timer: - if return_schedule_plan: - assert config.overlap_moe_expert_parallel_comm, ( - "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan" - ) - schedule_plan = model.build_schedule_plan( - tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask - ) - return schedule_plan, loss_mask - else: - output_tensor = model(**forward_args) - - return output_tensor, loss_mask - - -def forward_step( - state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False -) -> tuple[torch.Tensor, partial]: - """Forward training step. 
- - Args: - state: Global state for the run - data_iterator: Input data iterator - model: The GPT Model - return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor - - Returns: - tuple containing the output tensor and the loss function - """ - output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan) - - loss_function = _create_loss_function( - loss_mask, - check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss, - check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss, - ) - - return output, loss_function - - -def _create_loss_function(loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial: - """Create a partial loss function with the specified configuration. - - Args: - loss_mask: Used to mask out some portions of the loss - check_for_nan_in_loss: Whether to check for NaN values in the loss - check_for_spiky_loss: Whether to check for spiky loss values - - Returns: - A partial function that can be called with output_tensor to compute the loss - """ - return partial( - masked_next_token_loss, - loss_mask, - check_for_nan_in_loss=check_for_nan_in_loss, - check_for_spiky_loss=check_for_spiky_loss, - ) - - -def forward_step_modelopt( - state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False -) -> tuple[torch.Tensor, partial]: - """Forward training step with ModelOpt required modifications. 
- - Args: - state: Global state for the run - data_iterator: Input data iterator - model: The GPT Model - return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor - - Returns: - tuple containing the output tensor and the loss function - """ - output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan) - - loss_function = _create_loss_function_modelopt( - loss_mask, - model, - check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss, - check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss, - ) - - return output, loss_function - - -def _create_loss_function_modelopt( - loss_mask: torch.Tensor, model: GPTModel, check_for_nan_in_loss: bool, check_for_spiky_loss: bool -) -> partial: - """Create a partial loss function with the specified configuration. - - Kept here for backward compatibility with tests and callers that patch - `megatron.bridge.training.gpt_step.masked_next_token_loss`. - - Args: - loss_mask: Used to mask out some portions of the loss - model: The GPT Model - check_for_nan_in_loss: Whether to check for NaN values in the loss - check_for_spiky_loss: Whether to check for spiky loss values - - Returns: - A partial function that can be called with output_tensor to compute the loss - """ - mnt_loss_func = partial( - masked_next_token_loss, - loss_mask, - check_for_nan_in_loss=check_for_nan_in_loss, - check_for_spiky_loss=check_for_spiky_loss, - ) - unwrapped_model = unwrap_model(model) - if isinstance(unwrapped_model, mtd.DistillationModel): - return partial(loss_func_kd, loss_mask=loss_mask, original_loss_fn=mnt_loss_func, model=unwrapped_model) - else: - return mnt_loss_func - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/pretrain.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch.distributed as dist -from nvidia_resiliency_ext.inprocess import CallWrapper - -from megatron.bridge.data.utils import get_dataset_provider -from megatron.bridge.training.callbacks import Callback, CallbackManager, normalize_callbacks -from megatron.bridge.training.config import ConfigContainer, runtime_config_update -from megatron.bridge.training.eval import evaluate_and_print_results -from megatron.bridge.training.forward_step_func_types import ForwardStepCallable -from megatron.bridge.training.setup import setup -from megatron.bridge.training.state import GlobalState -from megatron.bridge.training.train import _finish_train, train -from megatron.bridge.training.utils.log_utils import barrier_and_log -from megatron.bridge.utils.common_utils import print_rank_0 -from megatron.bridge.utils.decorators import experimental_fn - - -@experimental_fn -def pretrain( - config: ConfigContainer, - forward_step_func: ForwardStepCallable, - callbacks: list[Callback] | CallbackManager | None = None, -) -> None: - """Main function to run the training pipeline. - - Sets up the environment, model, optimizer, scheduler, and data iterators. - Performs training, validation, and optionally testing based on the provided - configuration. - - Args: - config: The main configuration container holding all necessary parameters. 
- forward_step_func: A callable (function or functor) that performs a single - forward and backward step, returning the loss and any computed - metrics. Supports the following signatures: - - 2 args: (data_iterator, model) - - 3 args: (data_iterator, model, return_schedule_plan=False) - OR (state: GlobalState, data_iterator, model) - - 4 args: (state: GlobalState, data_iterator, model, return_schedule_plan=False) - callbacks: Optional callbacks for custom logic injection. Can be: - - list[Callback]: List of Callback subclass instances - - CallbackManager: Pre-configured manager with registered callbacks - - None: No callbacks (default) - - Note: - Use the signature with GlobalState type hint for full access to configuration, timers, and training state. - State injection is automatic based on type hints or parameter names. - Functors (classes with __call__) are fully supported. - - Warnings: - This is an experimental API and is subject to change in backwards - incompatible ways without notice. - """ - # Apply runtime config updates prior to creating/attaching GlobalState - runtime_config_update(config) - - # Create a single GlobalState instance regardless of restart path - state = GlobalState() - state.cfg = config - - # Normalize callbacks to CallbackManager - callback_manager = normalize_callbacks(callbacks) - - if config.inprocess_restart and config.inprocess_restart.enabled: - if dist.is_initialized(): - raise RuntimeError( - "In-process restart is incompatible with user-initialized process groups. " - "The in-process restart mechanism expects to manage the process group lifecycle " - "and will destroy it during fault recovery. Either:\n" - "1. Disable in-process restart and manage the process group yourself, or\n" - "2. Let the framework initialize the process group by not calling " - "torch.distributed.init_process_group() before training." 
- ) - - # Apply in-process restart wrapper directly to _pretrain - from megatron.bridge.training.inprocess_restart import maybe_wrap_for_inprocess_restart - - # Wrap _pretrain directly and get the store; state is captured for abort - wrapped_pretrain, store = maybe_wrap_for_inprocess_restart(_pretrain, config.inprocess_restart, state) - - # Execute the wrapped function - nvidia-resiliency-ext will inject inprocess_call_wrapper - # Call with positional args matching the adapter signature: (state, forward_step_func, store=None, inprocess_call_wrapper=None) - wrapped_pretrain(state, forward_step_func, callback_manager, store=store) - else: - # Normal execution without in-process restart - _pretrain(state=state, forward_step_func=forward_step_func, callback_manager=callback_manager) - - -def _pretrain( - state: GlobalState, - forward_step_func: ForwardStepCallable, - callback_manager: CallbackManager | None = None, - store: dist.Store | None = None, - inprocess_call_wrapper: CallWrapper | None = None, -) -> None: - """Internal function containing the actual pretrain logic. 
- - Args: - state: Global training state containing the validated configuration and runtime objects - forward_step_func: Function or functor that performs a single forward/backward step - callback_manager: Optional CallbackManager for custom callback execution - store: Optional distributed Store used by in-process restart for coordination - inprocess_call_wrapper: Optional wrapper injected by nvrx to expose restart iteration - """ - # Determine whether the training loop will initialize the process group - # If the trainer creates the process group, the trainer should destroy it before returning control back to the user - should_destroy_process_group = not dist.is_initialized() - - # Handle in-process restart store prefix - if inprocess_call_wrapper is not None: - restart_attempt = inprocess_call_wrapper.iteration - store = dist.PrefixStore(str(restart_attempt), store) - - config = state.cfg - dataset_provider = get_dataset_provider(config.dataset) - setup_output = setup(state, dataset_provider, restart_store=store, callback_manager=callback_manager) - state = setup_output.state - model = setup_output.model - optimizer = setup_output.optimizer - scheduler = setup_output.scheduler - train_data_iterator = setup_output.train_data_iterator - valid_data_iterator = setup_output.valid_data_iterator - test_data_iterator = setup_output.test_data_iterator - checkpoint_manager = setup_output.checkpoint_manager - pg_collection = setup_output.pg_collection - - # TRAINING - if not config.validation.skip_train: - if state.train_state.do_train and config.train.train_iters > 0: - train( - forward_step_func, - model, - optimizer, - scheduler, - train_data_iterator, - valid_data_iterator, - state, - checkpoint_manager, - pg_collection, - callback_manager=callback_manager, - ) - - barrier_and_log("after training is done") - - else: - print_rank_0("skipping training ...") - - iteration = state.train_state.step - - # VALIDATION - if state.train_state.do_valid: - prefix = f"iteration 
{iteration} on validation set" - evaluate_and_print_results( - state, - prefix, - forward_step_func, - valid_data_iterator, - model, - config.model, - verbose=True, - write_to_tensorboard=not config.validation.skip_train, - callback_manager=callback_manager, - ) - if state.train_state.do_test: - prefix = f"iteration {iteration} on test set" - evaluate_and_print_results( - state, - prefix, - forward_step_func, - test_data_iterator, - model, - config.model, - verbose=True, - write_to_tensorboard=not config.validation.skip_train, - callback_manager=callback_manager, - is_test=True, - ) - - _finish_train(state, checkpoint_manager) - _maybe_destroy_process_group(should_destroy_process_group) - - -def _maybe_destroy_process_group(should_destroy: bool) -> None: - """Destroy the process group if it was created by this training session. - - Args: - should_destroy: Whether the process group should be destroyed - """ - if should_destroy and dist.is_initialized(): - dist.barrier() - dist.destroy_process_group() - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/pp_comm_overlap.png -```png -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/pruning.md -```md -# Pruning - -Pruning reduces model size by removing redundant parameters (e.g., shrinking hidden dimensions or layers) while preserving accuracy. In Megatron Bridge, pruning is provided by [NVIDIA Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer) using the Minitron algorithm for GPT and Mamba-based models loaded from HuggingFace. - -## Pre-requisites - -Running the pruning example requires Megatron-Bridge and Model-Optimizer dependencies. We recommend using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02`). To use the latest ModelOpt scripts, mount your Model-Optimizer repo to the container. - -```bash -export MODELOPT_DIR=${PWD}/Model-Optimizer # or set to your local Model-Optimizer repository path if you have cloned it -if [ ! 
-d "${MODELOPT_DIR}" ]; then - git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR} -fi - -export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.02 -docker run \ - --gpus all \ - --shm-size=20g \ - --net=host \ - --ulimit memlock=-1 \ - --rm -it \ - -v ${MODELOPT_DIR}:/opt/Model-Optimizer \ - -v ${MODELOPT_DIR}/modelopt:/opt/venv/lib/python3.12/site-packages/modelopt \ - -w /opt/Model-Optimizer/examples/megatron_bridge \ - ${DOCKER_IMAGE} bash -``` - -Once inside the container, you need to login with your HuggingFace token to download gated datasets / models. -Note that the default dataset for pruning is [`nemotron-post-training-dataset-v2`](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2), which is gated. - -```bash -huggingface-cli login --token -``` - -## Usage - -### Prune to a target parameter count (using Neural Architecture Search) - -Example: prune Qwen3-8B to 6B on 2 GPUs (Pipeline Parallelism = 2), skipping pruning of `num_attention_heads`. Defaults: 1024 samples from [nemotron-post-training-dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) for calibration, at most 20% depth (`num_layers`) and 40% width per prunable hyperparameter (`hidden_size`, `ffn_hidden_size`, ...), top-10 candidates evaluated for MMLU (5% sampled data) to select the best model. - -```bash -torchrun --nproc_per_node 2 prune_minitron.py \ - --pp_size 2 \ - --hf_model_name_or_path Qwen/Qwen3-8B \ - --prune_target_params 6e9 \ - --hparams_to_skip num_attention_heads \ - --output_hf_path /tmp/Qwen3-8B-Pruned-6B -``` - -### Prune to a specific architecture (using manual configuration) - -Example: prune Qwen3-8B to a fixed architecture. Defaults: 1024 samples from [nemotron-post-training-dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) for calibration. 
- -```bash -torchrun --nproc_per_node 2 prune_minitron.py \ - --pp_size 2 \ - --hf_model_name_or_path Qwen/Qwen3-8B \ - --prune_export_config '{"hidden_size": 3584, "ffn_hidden_size": 9216}' \ - --output_hf_path /tmp/Qwen3-8B-Pruned-6B-manual -``` - -To see the full list of options for advanced configurations, run: - -```bash -torchrun --nproc_per_node 1 prune_minitron.py --help -``` - -### Uneven pipeline parallelism - -If the number of layers is not divisible by the number of GPUs (pipeline parallel size), set `--num_layers_in_first_pipeline_stage` and `--num_layers_in_last_pipeline_stage`. For example, Qwen3-8B with 36 layers on 8 GPUs: set both to 3 to get 3-5-5-5-5-5-5-3 layers per GPU. - -## More information - -For more details, see the [ModelOpt pruning README](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#readme). - -## Next steps: Knowledge Distillation - -Knowledge Distillation is required to recover the performance of the pruned model. See the [Knowledge Distillation](distillation.md) guide for more details. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/resiliency.md -```md -# Resiliency - -Megatron Bridge incorporates resilient training features from the -[NVIDIA Resiliency Extension](https://github.com/NVIDIA/nvidia-resiliency-ext). -This extension provides fault-tolerant capabilities that help minimize downtime -due to failures and interruptions during training. - -This page is the stable overview for what each resiliency feature is, when to -use it, and which constraints are durable. For operational setup, config knobs, -parameter tables, code anchors, and verification commands, see [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md). 
- -## What It Is - -| Feature | Purpose | Maturity | Cluster | -|---|---|---|---| -| Fault tolerance | Hang detection + automatic job restart | Production | Slurm only | -| NVRx straggler detection | Identify slow GPUs | Production | Any | -| Preemption | Graceful shutdown before time limit | Production | Slurm only | -| Async checkpoint save | Non-blocking checkpoint writes | Production | Any | -| Local checkpointing | Fast local save with replication | Production | Any | -| Re-run state machine | NaN / spiky loss attribution | Experimental | Any | -| In-process restart | Restart within the same process | Experimental | Any | - -## Fault Tolerance - -The fault tolerance feature detects hangs during training and automatically -restarts the workload. It uses section-based monitoring with different timeout -thresholds for setup, training steps, and checkpointing operations. - -### When to Use It - -Fault tolerance is a good fit when: - -- training on unreliable hardware or at very large scale -- transient faults (network glitches, GPU errors) are common -- you want automatic recovery without manual intervention - -### Stable Constraints - -- Requires Slurm and `ft_launcher` (not `torchrun`) -- Checkpoint directory must be configured and accessible -- Uses `nvidia-resiliency-ext` RankMonitorClient -- Not compatible with NSys profiling - -The system supports both in-job restarts (within the same Slurm allocation) and -new job launches on failure, with configurable limits for each. - -## Straggler Detection - -NVRx straggler detection monitors GPU performance across ranks and identifies -slow-performing nodes. It calculates both relative and individual performance -scores, and can optionally terminate training if performance falls below -configurable thresholds. 
- -### When to Use It - -Straggler detection is useful when: - -- training at scale where one slow node degrades overall throughput -- you want visibility into per-rank GPU performance -- you need to identify persistent hardware issues - -### Stable Constraints - -- Requires `nvidia-resiliency-ext` -- Overhead is minimal but can be tuned via `profiling_interval` -- Does **not** stop training by default; `stop_if_detected` must be - explicitly set to `True` for automatic termination - -## Preemption - -Preemption handling provides graceful shutdown when a training job receives a -termination signal (default: SIGTERM). It saves a checkpoint before exiting to -preserve training progress. - -### When to Use It - -Preemption is important when: - -- running on shared clusters with job time limits -- higher-priority jobs may preempt your allocation -- you want to minimize lost work on job termination - -### Stable Constraints - -- The `PreemptionPlugin` is Slurm-specific -- Direct configuration via `exit_signal_handler` works on any cluster -- Signal detection happens at the end of each training step - -## Async Checkpoint Save - -Async checkpoint save overlaps checkpoint I/O with training compute using -persistent background workers. Training continues immediately after scheduling -the save rather than blocking until the write completes. - -### When to Use It - -Async save is valuable when: - -- checkpoint save time is a significant fraction of step time -- you are using `torch_dist` checkpoint format - -### Stable Constraints - -- Requires `ckpt_format="torch_dist"` -- Other formats (zarr, fsdp_dtensor) do not support async save -- The persistent checkpoint worker must be enabled - -## Local Checkpointing - -Local checkpointing saves checkpoint data to node-local storage first, then -replicates across a configurable number of nodes. This avoids the latency of -writing to shared network storage during the critical path. 
- -### When to Use It - -Local checkpointing is useful when: - -- shared-storage checkpoint writes are the bottleneck in your checkpoint interval -- you want faster recovery from node failures without depending on network filesystem availability -- training at scale where network-storage contention is common - -### Stable Constraints - -- Node-local storage must have sufficient capacity for at least one checkpoint -- Replication degree must be configured to survive the expected failure rate -- Requires compatible checkpoint format (see [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md)) - -## Re-run State Machine - -The re-run state machine is an experimental feature for attributing unexpected -results (NaN loss, spiky loss) to transient errors, persistent hardware faults, -or correct-but-unexpected results. It works by re-running computations on the -same and different GPUs. - -### When to Use It - -Consider the re-run state machine when: - -- you need automated NaN detection and attribution -- you want to distinguish hardware faults from training instability - -### Stable Constraints - -- Alpha-level feature; full integration is limited -- Three modes: `disabled`, `validate_results`, `report_determinism_stats` -- Uses specific exit codes (16, 17) to control job behavior - -## In-Process Restart - -In-process restart provides automatic fault recovery by restarting the training -function within the same OS process. This avoids the overhead of launching new -jobs, starting containers, and creating new CUDA contexts. 
- -### When to Use It - -In-process restart is appropriate when: - -- software faults (exceptions, deadlocks) are more common than hardware faults -- restart latency matters and you want to avoid full job relaunch -- you can accept the experimental status and compatibility constraints - -### Stable Constraints - -- Requires PyTorch >= 2.5.1 and NCCL >= 2.26.2 -- Not compatible with NeMo-Run or Slurm preemption plugins -- Requires specific environment variables (`NCCL_NVLS_ENABLE=0`, etc.) -- The PyTorch NCCL watchdog timeout must exceed `hard_timeout` -- Supports both node-level and rank-level restart granularity - -In-process restart is not suitable for hardware-level failures such as switch -failures or network partitions. For comprehensive fault tolerance, combine it -with job-level fault tolerance. - -## Practical Caveats - -1. No single resiliency feature covers all failure modes. The recommended - approach is to layer features (e.g., fault tolerance + straggler detection + - async checkpoint). -2. Not all recipes enable resiliency features by default. Check and enable - explicitly. -3. Two straggler detectors exist in the codebase (NVRx and legacy MCore). - Use the NVRx version; do not enable both. - -## Related Docs - -- [docs/training/checkpointing.md](checkpointing.md) -- [docs/performance-guide.md](../performance-guide.md) -- [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md) -- [skills/resiliency/card.yaml](../skills/resiliency/card.yaml) -- [NVIDIA Resiliency Extension](https://github.com/NVIDIA/nvidia-resiliency-ext) -- [In-Process Restart Guide](https://nvidia.github.io/nvidia-resiliency-ext/inprocess/index.html) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/activation-recomputation.md -```md -# Activation Recomputation - -The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. 
When training a LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage. - -Activation recomputation in Megatron Bridge is configured through the model provider's recomputation parameters, which are based on Megatron Core's `TransformerConfig`. - -## Transformer Layer Recomputation - -Megatron Bridge supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer's forward computation. - -Megatron Bridge also supports partial transformer layer recomputation, which is beneficial when recomputing a few transformer layers helps to reduce enough GPU memory for the model to fit. This approach avoids the need to recompute the rest of the layers. - -### Configuration - -Transformer layer recomputation is configured through the model provider's recomputation parameters: - -```python -from megatron.bridge.models import GPTModelProvider - -# Full recomputation - recompute all layers -model_config = GPTModelProvider( - recompute_granularity="full", # Enable full layer recomputation - recompute_method="uniform", # Uniform distribution across layers - recompute_num_layers=4, # Number of layers per recomputation block - # ... 
other model parameters -) -``` - -### Recomputation Methods - -#### Block Method -Recomputes a specific number of transformer layers per pipeline stage: - -```python -model_config = GPTModelProvider( - recompute_granularity="full", - recompute_method="block", # Block-wise recomputation - recompute_num_layers=4, # Recompute 4 layers per pipeline stage -) -``` - -#### Uniform Method -Uniformly divides the total number of transformer layers and recomputes input activations for each divided chunk: - -```python -model_config = GPTModelProvider( - recompute_granularity="full", - recompute_method="uniform", # Uniform distribution - recompute_num_layers=8, # Number of layers per recomputation block -) -``` - -### Pipeline Parallelism Considerations - -When training with pipeline parallelism: -- `recompute_num_layers` indicates the layers per pipeline stage -- When using virtual pipelining, `recompute_num_layers` specifies the number of layers per virtual pipeline stage -- The framework automatically handles recomputation coordination across pipeline stages - -![Activation Recomputation Methods](images/activation-recomputation-example-1.jpg) -*Figure 1: Scheme of uniform and block checkpointing method (full checkpointing granularity)* - -## Self-attention Recomputation - -Megatron Bridge supports selective self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations. This cost-efficient method achieves high memory savings with minimal recomputation cost. - -The intermediate layers of the self-attention block account for the majority of the activation memory because the input sizes of softmax, dropout, and QKV dot-product attention layers have memory complexity proportional to the sequence length squared. However, their recomputation cost is relatively smaller than other linear projection layers that scale with the hidden size squared. 
- -![Activation Recomputation Granularity](images/activation-recomputation-example-2.jpg) -*Figure 2: Scheme of full and selective checkpointing granularity* - -### Configuration - -Self-attention recomputation is enabled using selective granularity: - -```python -from megatron.bridge.models import GPTModelProvider - -model_config = GPTModelProvider( - recompute_granularity="selective", # Enable selective recomputation - recompute_modules=["core_attn"], # Recompute attention modules (default) - # ... other model parameters -) -``` - -### Recomputation Modules - -Megatron Bridge supports selective recomputation for various modules: - -```python -model_config = GPTModelProvider( - recompute_granularity="selective", - recompute_modules=[ - "core_attn", # Core attention computation (default) - "mlp", # MLP layers - "layernorm", # Layer normalization - "moe", # Mixture of Experts layers - "moe_act", # MoE activation functions - "shared_experts", # Shared expert layers - "mla_up_proj", # Multi-Latent Attention up projection - ], -) -``` - -### Flash Attention Integration - -Self-attention recomputation is automatically enabled when using Flash Attention through Transformer Engine. Flash Attention inherently provides memory efficiency by recomputing attention scores rather than storing them, making additional explicit recomputation often unnecessary. 
- -## Advanced Recomputation Configuration - -### Distributed Activation Checkpointing - -For models using model parallelism, you can distribute saved activations across the model parallel group: - -```python -model_config = GPTModelProvider( - recompute_granularity="selective", - distribute_saved_activations=True, # Distribute across model parallel group - # Note: Cannot be used with sequence_parallel=True -) -``` - -### Memory vs Computation Trade-offs - -Different recomputation strategies offer different memory-computation trade-offs: - -- **Selective recomputation**: Provides high memory savings with minimal recomputation cost by targeting memory-intensive operations like attention -- **Full recomputation**: Significantly reduces activation memory usage but increases per-transformer layer computation cost by approximately 30% -- **No recomputation**: Preserves all activations in memory, requiring more GPU memory but no additional computation - -### MoE-Specific Recomputation - -For Mixture of Experts models, specialized recomputation options are available: - -```python -model_config = GPTModelProvider( - # MoE configuration - num_moe_experts=8, - expert_model_parallel_size=2, - - # MoE recomputation - recompute_granularity="selective", - recompute_modules=["moe", "moe_act"], # Recompute MoE-specific modules -) -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/config-container-overview.md -```md -# Configuration Overview - -The `ConfigContainer` is the central configuration object in Megatron Bridge that holds all settings for training. It acts as a single source of truth that brings together model architecture, training parameters, data loading, optimization, checkpointing, logging, and distributed training settings. 
- -## What is ConfigContainer - -`ConfigContainer` is a dataclass that holds all the configuration objects needed for training: - -```python -from megatron.bridge.training.config import ConfigContainer - -# ConfigContainer brings together all training configurations -config = ConfigContainer( - model=model_provider, # Model architecture and parallelism - train=training_config, # Training loop parameters - optimizer=optimizer_config, # Megatron Optimization settings - scheduler=scheduler_config, # Learning rate scheduling - dataset=dataset_config, # Data loading configuration - logger=logger_config, # Logging and monitoring - tokenizer=tokenizer_config, # Tokenization settings - checkpoint=checkpoint_config, # Checkpointing and resuming - dist=distributed_config, # Distributed training setup - ddp=ddp_config, # Megatron Distributed Data Parallel settings - # Optional configurations - peft=peft_config, # Parameter-efficient fine-tuning - profiling=profiling_config, # Performance profiling - mixed_precision=mp_config, # Mixed precision training - comm_overlap=comm_overlap_config, # Communication overlap settings - # ... 
and more -) -``` - -## Configuration Components - -| Component | Purpose | Required | Default | -|-----------|---------|----------|---------| -| `model` | Model architecture and parallelism strategy (GPT, T5, Mamba) | ✅ | - | -| `train` | Training loop parameters (batch sizes, iterations, validation) | ✅ | - | -| `optimizer` | Optimizer type and hyperparameters (from Megatron Core) | ✅ | - | -| `scheduler` | Learning rate and weight decay scheduling | ✅ | - | -| `dataset` | Data loading and preprocessing configuration | ✅ | - | -| `logger` | Logging, TensorBoard, and WandB configuration | ✅ | - | -| `tokenizer` | Tokenizer settings and vocabulary | ✅ | - | -| `checkpoint` | Checkpointing, saving, and loading | ✅ | - | -| `dist` | Distributed training initialization | | `DistributedInitConfig()` | -| `ddp` | Data parallel configuration (from Megatron Core) | | `DistributedDataParallelConfig()` | -| `rng` | Random number generation settings | | `RNGConfig()` | -| `rerun_state_machine` | Result validation and error injection | | `RerunStateMachineConfig()` | -| `mixed_precision` | Mixed precision training settings | | `None` | -| `comm_overlap` | Communication overlap optimizations | | `None` | -| `peft` | Parameter-efficient fine-tuning (LoRA, DoRA, etc.) 
| | `None` | -| `profiling` | Performance profiling with nsys or PyTorch profiler | | `None` | -| `ft` | Fault tolerance and automatic recovery | | `None` | -| `straggler` | GPU straggler detection | | `None` | -| `nvrx_straggler` | NVIDIA Resiliency Extension straggler detection | | `None` | -| `inprocess_restart` | In-process restart for fault tolerance | | `None` | - -## Design Philosophy - -### **Interoperability with External Config Systems** - -Megatron Bridge's Python configurations are designed to be amenable to other configuration systems you already use, such as: - -- Programmatic configuration: Direct Python object manipulation -- argparse: Command-line arguments can be easily mapped to dataclass fields -- File-based overrides: JSON, YAML, or other config files can override Python configs - -All of these approaches can be translated into Python dataclass instances. The framework provides utilities as a convenience for YAML-based overrides with OmegaConf, but the framework is not tied to any particular configuration system. - -```python -# All of these approaches work seamlessly: - -# 1. Direct Python configuration -config = ConfigContainer( - model=GPTModelProvider(num_layers=24, hidden_size=2048), - train=TrainingConfig(global_batch_size=256, train_iters=10000), - # ... other configs -) - -# 2. YAML-based serialization and deserialization (round-trip) -config.to_yaml("my_config.yaml") -config = ConfigContainer.from_yaml("my_config.yaml") # Load previously saved config - -# 3. Programmatic override after creation -config.train.global_batch_size = 512 # Override after instantiation -config.model.num_layers = 48 # Modify model architecture -``` - -### Centralized Configuration - -Megatron provides extensive flexibility through a rich set of configuration options. The `ConfigContainer` brings all these settings together in a single, organized object. 
This centralization makes configuration discoverable and maintainable - you have one place to understand and control all aspects of your training run. - -Unlike pure YAML-based configuration systems, `ConfigContainer` provides centralization with the full power of Python. You get the organizational benefits of a single configuration file combined with the programmatic flexibility of Python. - -The configuration system is built using nested dataclasses, providing: - -- **Modularity**: Each config component is independently defined and testable -- **Type safety**: Full static type checking -- **IDE support**: Autocomplete and type hints in development environments -- **Serialization**: Easy conversion to/from YAML, JSON, or other formats -- **Validation**: Built-in field validation - -```python -@dataclass -class ConfigContainer: - model: GPTModelProvider # Dataclass for model architecture - train: TrainingConfig # Dataclass for training parameters - optimizer: OptimizerConfig # Dataclass for optimization settings - # ... nested dataclasses for each concern -``` - -### Lazy Configuration and Deferred Validation - -For training workloads, configurations are lazy to support flexible user workflows: - -**Problem with Eager Validation:** -```python -# This would be problematic with eager validation: -config = TrainingConfig(train_iters=1000) -# __post_init__ calculates dependent values immediately - -config.train_iters = 5000 # User override -# Dependent values are now stale and incorrect! 
-``` - -**Solution with Lazy Finalization:** -```python -# Megatron Bridge approach - deferred validation -config = TrainingConfig(train_iters=1000) -config.train_iters = 5000 # User can safely override - -# Validation happens automatically right when training starts -pretrain(config, forward_step_func) # All dependent values calculated correctly -``` - -**Benefits:** -- Users can instantiate configs and subsequently override fields safely -- Dependent values are calculated correctly after all user modifications are applied -- Validation happens at the right time, right before training begins -- Flexible configuration workflows are supported - -### **Model Independence** - -Model configurations are designed to be independently usable outside the full training loop provided by the framework: - -```python -# Models can be used standalone -model_provider = GPTModelProvider( - num_layers=24, - hidden_size=2048, - vocab_size=50000, # Must be explicitly set - seq_length=2048, # Must be explicitly set -) - -# This works independently of other configs -model_provider.finalize() -model = model_provider.provide() -``` - -**Trade-off**: The price for this flexibility is the need to explicitly set values like `seq_length` in multiple places during training. These settings are checked for consistency at the beginning of training. - -## Usage - -```python -# Create and configure -config = ConfigContainer( - model=GPTModelProvider(num_layers=24, seq_length=2048), - train=TrainingConfig(train_iters=1000), - dataset=GPTDatasetConfig(seq_length=2048), # Must match model seq_length - # ... 
other required configs -) - -# Modify as needed -config.train.train_iters = 5000 -config.model.hidden_size = 4096 - -# Start training - validation happens automatically -pretrain(config, forward_step_func) -``` - -## Configuration Export and Import - -### Export to YAML -```python -# Print YAML configuration to console -config.print_yaml() - -# Save to file -config.to_yaml("config.yaml") -``` - -### Load from YAML -```python -# Load configuration from YAML file -config = ConfigContainer.from_yaml("config.yaml") -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/multi-token-prediction.md -```md -# Multi-Token Prediction (MTP) - -## Overview - -Multi-Token Prediction (MTP) is an advanced training technique introduced in the [DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437) that enables models to predict multiple future tokens simultaneously during pre-training. Instead of learning to predict only the next token at each position, MTP adds auxiliary prediction heads that predict tokens 2, 3, or more positions ahead. - -### Key Benefits - -- **Densified Training Signals**: Multiple learning signals per training iteration improve data efficiency -- **Pre-Planning Representations**: Models learn internal representations that encode information about future tokens -- **Speculative Decoding Foundation**: MTP-trained models can serve as foundation for faster inference via speculative decoding - -### When to Use MTP - -MTP is most beneficial for: - -- **Large-scale pre-training** (models > 10B parameters) -- **Data-constrained scenarios** where maximizing learning from limited data is critical -- **Training foundation models** intended for downstream fine-tuning or speculative decoding - -MTP is primarily used for pre-training. 
- -### Additional Resources - -- [DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437) - Original paper introducing MTP -- [DeepSeek-V3 GitHub](https://github.com/deepseek-ai/DeepSeek-V3) - Official implementation -- [Megatron Core MTP Guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/multi_token_prediction.md) - Low-level implementation details - -## Configuration Parameters - -MTP is controlled by two primary parameters: - -| Parameter | Type | Default | Description | Typical Range | -|-----------|------|---------|-------------|---------------| -| `mtp_num_layers` | int | `None` (disabled) | Number of auxiliary prediction depths. Each layer predicts tokens N positions ahead (N=1,2,...,mtp_num_layers). | 1-2 | -| `mtp_loss_scaling_factor` | float | `0.1` | Weight applied to MTP losses relative to main next-token loss. Controls the contribution of auxiliary predictions to the total loss. | 0.05-0.2 | - -### Loss Calculation - -The total training loss combines the main next-token prediction loss with averaged MTP losses: - -``` -total_loss = main_loss + (avg_mtp_loss * mtp_loss_scaling_factor) - -where: - avg_mtp_loss = mean([mtp_1_loss, mtp_2_loss, ..., mtp_N_loss]) -``` - -### Parameter Tuning Guidelines - -**`mtp_num_layers`:** -- Start with `1` for most models (predicts 1 token ahead) -- Use `2` for models > 100B parameters if memory allows -- Higher values increase memory usage and training time proportionally - -**`mtp_loss_scaling_factor`:** -- Default `0.1` works well for most models -- Increase to `0.15-0.2` if MTP losses aren't decreasing -- Decrease to `0.05-0.08` if main loss is being overshadowed -- Scale factor should be inversely proportional to `mtp_num_layers` (more layers → lower factor) - -## Basic Usage: Training from Scratch - -### Minimal Configuration Example - -Here's a minimal example using the Qwen3 30B-A3B recipe with MTP enabled: - -```python -from megatron.bridge.recipes.qwen.qwen3_moe import 
qwen3_30b_a3b_pretrain_config -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.config import ConfigContainer - -log_dir = "/path/to/log/dir" -cfg: ConfigContainer = qwen3_30b_a3b_pretrain_config() -cfg.logger.tensorboard_dir = log_dir + "/tb_logs" -cfg.checkpoint.save = log_dir + "/checkpoints" -cfg.checkpoint.load = log_dir + "/checkpoints" -# Set up training dataset -cfg.dataset.blend=[[ - f"/path/to/dclm/preprocessed/dclm_{i:02d}_text_document" - for i in range(1, 11) -], None] -cfg.dataset.split="9999,8,2" -cfg.dataset.path_to_cache = "/path/to/cache" -# cfg.model.num_layers = 8 # train a smaller model if OOM -# MTP Configuration -cfg.model.mtp_num_layers = 1 -cfg.model.mtp_loss_scaling_factor = 0.1 -pretrain(cfg, forward_step) -``` -Follow the [DCLM Tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/data/dclm) to prepare the training data - - -## MTP with Pipeline Parallelism - -When using Pipeline Parallelism (PP), **MTP layers must be placed in the last pipeline stage** alongside the loss computation layer. Configure this using custom pipeline layout settings (`pipeline_model_parallel_split_rank`). - -### Pipeline Layout Guidelines - -MTP layers take approximately the same training time as a regular transformer layer. 
When configuring your pipeline layout: - -- **Place MTP in the last PP stage** (required for correct loss computation) -- **Reduce layers in other PP ranks** to balance computation time across stages -- Example: For a 21-layer model with PP=4 and `mtp_num_layers=1`, you might use splits like `[5, 6, 6, 4]` instead of `[5, 5, 5, 6]` to account for MTP overhead in the last stage - - -## Parallelism Support - -MTP is compatible with all major parallelism strategies in Megatron-Bridge: - -| Parallelism Type | Support Status | Notes | -|------------------|----------------|-------| -| **Tensor Parallelism (TP)** | ✅ Fully Supported | MTP layers are automatically sharded across TP ranks | -| **Pipeline Parallelism (PP)** | ✅ Supported with Constraint | MTP must be in last pipeline stage (see above) | -| **Expert Parallelism (EP)** | ✅ Fully Supported | Works with MoE models (DeepSeek-V3, Mixtral, etc.) | -| **Context Parallelism (CP)** | ✅ Fully Supported | MTP supports long-context training via CP | -| **Data Parallelism (DP)** | ✅ Fully Supported | Standard data parallelism works transparently | - -## Monitoring MTP Training - -### Per-Layer Loss Logging - -During training, you'll see losses for each MTP depth logged separately: - -``` -iteration 100/ 300000 | consumed samples: 3200 | elapsed time per iteration (ms): 3738.6 | learning rate: 6.000000E-05 | global batch size: 32 | lm loss: 7.968678E+00 | load_balancing_loss: 1.329517E+00 | mtp_1 loss: 7.925096E+00 | loss scale: 1.0 | grad norm: 1.040 | number of skipped iterations: 0 | number of nan iterations: 0 | -``` - -### Interpreting Loss Values - -![MTP Loss Curves](../images/mtp_loss.png) - -The figure above shows typical training curves for MTP-enabled training: -- **Left**: MTP auxiliary loss (`mtp_1 loss`) tracking the first additional token prediction -- **Right**: Main language model loss (`lm loss`) for standard next-token prediction - -**Expected Patterns:** - -- **MTP losses are higher than main loss**: 
Predicting tokens further in the future is inherently harder. In the example above, `mtp_1 loss` (~4.3) is higher than `lm loss` (~3.9) at 3500 iterations. - -- **All losses decrease over training**: Both main and MTP losses should trend downward, as shown in the curves above. - -- **Loss gap remains relatively stable**: The difference between main and MTP losses should not grow significantly over training. - -**Red Flags:** - -- **NaN values**: Indicates training instability (see Troubleshooting section) -- **Diverging losses**: If MTP losses increase while main loss decreases, reduce `mtp_loss_scaling_factor` -- **Widening gap**: If MTP losses fall behind by > 1.0, increase `mtp_loss_scaling_factor` - -**MTP vs Non-MTP Comparison:** - -![MTP Loss Comparison](../images/mtp_loss_comparison.png) - -The figure above compares `lm loss` between MTP-enabled (blue) and non-MTP (red) training runs on Qwen3-30B-A3B. The curves do not differ significantly in the first few thousand iterations. Notably, the MTP-enabled run shows smoother behavior around iterations 1000 and 2300, where the non-MTP run exhibits more pronounced spikes. - -### TensorBoard/WandB Visualization - -MTP losses are automatically logged to TensorBoard and/or WandB. 
Look for: - -- `lm loss` - Main next-token prediction loss -- `mtp_1 loss` - First auxiliary prediction loss -- `mtp_2 loss` - Second auxiliary prediction loss (if `mtp_num_layers=2`) - -### Training Characteristics - -- MTP adds computational overhead due to additional forward passes -- Memory usage increases proportionally to `mtp_num_layers` -- MTP is designed to improve data efficiency during pre-training - -**Model Performance:** - -- MTP provides additional training signals at each token position -- Can potentially improve downstream task performance -- MTP-trained models can be used for speculative decoding during inference - -## Current Limitations - -The following features are not yet supported with MTP: - -| Feature | Status | Workaround | -|---------|--------|------------| -| **HuggingFace ↔ Megatron Checkpoint Conversion** | ⚠️ Model-specific | Conversion support varies by model; check model-specific documentation | -| **Sequence Packing (Fine-Tuning)** | ❌ Not supported | For pre-training, no issues. For fine-tuning, set `packed_sequence_specs=None` | -| **Cross-Attention** | ❌ Not supported | MTP only works with decoder-only models (GPT, Llama, etc.) | -| **Learned Absolute Position Embeddings** | ❌ Not supported | Use RoPE (rotary position embeddings) or no position embeddings | -| **Block-Based Activation Recomputation** | ❌ Not supported | Use `recompute_granularity="selective"` or `"uniform"` | - -### Important Notes - -**Checkpoint Conversion:** - -HuggingFace ↔ Megatron checkpoint conversion with MTP is model-specific. Some models have conversion support planned, while others may not support MTP parameter mapping. Check the documentation for your specific model. - -**Sequence Packing:** - -MTP is incompatible with fine-tuning sequence packing (e.g., SFT with packed sequences). For pre-training, there are no sequence packing restrictions. 
- -## Troubleshooting Guide - -### Error: Out of Memory (OOM) - -MTP increases memory usage proportionally to `mtp_num_layers`. Try: -- Reduce `mtp_num_layers` to 1 -- Enable activation recomputation: `recompute_granularity="selective"` -- Increase pipeline parallelism -- Reduce micro batch size - -### Error: MTP Loss is NaN - -Training instability. Try: -- Lower learning rate -- Enable gradient clipping: `clip_grad=1.0` -- Use BF16 instead of FP16 -- Reduce `mtp_loss_scaling_factor` to 0.05 - -### Expected Log: `MTP layers not found on this PP rank` - -This is normal. Only the last pipeline stage builds MTP layers. - -## Additional Resources - -### Code Examples - -- **DeepSeek-V3 Recipe**: [`src/megatron/bridge/recipes/deepseek/deepseek_v3.py`](../../src/megatron/bridge/recipes/deepseek/deepseek_v3.py) - - Example of MTP with large-scale MoE model - - Predefined pipeline layouts for PP + MTP - -- **Qwen3-Next Recipe**: [`src/megatron/bridge/recipes/qwen/qwen3_next.py`](../../src/megatron/bridge/recipes/qwen/qwen3_next.py) - - Clean example of MTP configuration for dense models - - Good starting point for custom recipes - -- **MTP Core Implementation**: [`3rdparty/Megatron-LM/megatron/core/transformer/multi_token_prediction.py`](../../3rdparty/Megatron-LM/megatron/core/transformer/multi_token_prediction.py) - - Low-level MTP layer implementation - - Loss computation and logging helpers - -### Documentation - -- **Megatron Core MTP Guide**: [`3rdparty/Megatron-LM/docs/user-guide/features/multi_token_prediction.md`](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/multi_token_prediction.md) - - Implementation notes and design decisions - -- **Pipeline Parallelism Guide**: [`docs/parallelisms.md`](../parallelisms.md) - - Understanding pipeline parallelism layouts - - Best practices for PP configuration - -### External Resources - -- **DeepSeek-V3 Technical Report**: [https://arxiv.org/abs/2412.19437](https://arxiv.org/abs/2412.19437) - - 
Original paper introducing MTP - - Section 3.2: "Multi-Token Prediction" - - Training details and ablation studies - -- **DeepSeek-V3 GitHub**: [https://github.com/deepseek-ai/DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3) - - Official implementation and model weights - - Training configurations and hyperparameters - -- **Megatron-LM GitHub**: [https://github.com/NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM) - - Upstream Megatron-Core implementation - - Issues and discussions - -### Getting Help - -If you encounter issues not covered in this guide: - -1. Check the [Megatron-Bridge GitHub Issues](https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues) -2. Review the [Megatron-LM Discussions](https://github.com/NVIDIA/Megatron-LM/discussions) - -When reporting issues, include: -- Full training configuration (recipe and parameters) -- Error messages and stack traces -- GPU type and count -- Megatron-Core version (`pip show megatron-core`) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/super/pretrain_nemotron_3_super.py -```py -#!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import logging -import os -import sys -from typing import Tuple - -import torch -from omegaconf import OmegaConf - -from megatron.bridge.recipes.nemotronh.nemotron_3_super import ( - nemotron_3_super_pretrain_config as pretrain_config, -) -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) - - -logger: logging.Logger = logging.getLogger(__name__) - - -def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]: - """Parse command line arguments, separating known script args from OmegaConf overrides.""" - parser = argparse.ArgumentParser( - description="Pretrain Nemotron 3 Super model using Megatron-Bridge with YAML and CLI overrides", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "--config-file", - type=str, - help="Path to the YAML OmegaConf override file. Default: conf/llama3_8b_pretrain_override_example.yaml", - ) - - # Parse known args for the script, remaining will be treated as overrides - args, cli_dotlist_overrides = parser.parse_known_args() - return args, cli_dotlist_overrides - - -def main() -> None: - """ - Entry point for the Nemotron 3 Super pretraining script. 
- """ - args, cli_overrides = parse_cli_args() - - cfg: ConfigContainer = pretrain_config() - - # Convert the initial Python dataclass to an OmegaConf DictConfig for merging - merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - - # Load and merge YAML overrides if a config file is provided - if args.config_file: - logger.debug(f"Loading YAML overrides from: {args.config_file}") - if not os.path.exists(args.config_file): - logger.error(f"Override YAML file not found: {args.config_file}") - sys.exit(1) - yaml_overrides_omega = OmegaConf.load(args.config_file) - merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - logger.debug("YAML overrides merged successfully.") - - # Apply command-line overrides using Hydra-style parsing - if cli_overrides: - logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}") - merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - logger.debug("Hydra-style command-line overrides applied successfully.") - - # Apply the final merged OmegaConf configuration back to the original ConfigContainer - logger.debug("Applying final merged configuration back to Python ConfigContainer...") - final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) - # Apply overrides while preserving excluded fields - apply_overrides(cfg, final_overrides_as_dict, excluded_fields) - - # Start training - logger.debug("Starting pretraining...") - pretrain(config=cfg, forward_step_func=forward_step) - - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - -if __name__ == "__main__": - main() - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/entry-points.md -```md -# Training Entry Points - -Megatron Bridge provides unified training entry points for pretraining, Supervised Fine-Tuning (SFT), and Parameter-Efficient Fine-Tuning (PEFT). 
All training modes share the same underlying training loop architecture, differing primarily in their data handling and model configuration. - -## Main Entry Points - -The {py:func}`bridge.training.pretrain.pretrain` and {py:func}`bridge.training.finetune.finetune` functions are the primary entry points for training models—either from scratch or through fine-tuning. Each function accepts a {py:class}`bridge.training.config.ConfigContainer` along with a `forward_step_func` that defines how the training loop should be run. - - -## Forward Step Function - -The `forward_step_func` defines how each training step is executed. It should follow this signature: - -```python -def forward_step_func( - global_state: GlobalState, - data_iterator: Iterable, - model: MegatronModule, - return_schedule_plan: bool = False, -) -> tuple[Any, Callable]: - """Forward step function. - - Args: - global_state: Training state object containing configuration and utilities - data_iterator: Iterator over training/evaluation data - model: The model to perform forward step on - return_schedule_plan: Whether to return schedule plan (for MoE overlap) - - Returns: - tuple containing: - - output: Forward pass output (tensor or collection of tensors) - - loss_func: Function to compute loss from the output - """ -``` - -### Responsibilities - -The forward step function has three main responsibilities: - -1. **Get a Batch**: Retrieve and process the next batch from the data iterator. -2. **Run Forward Pass**: Execute the model's forward pass on the batch. -3. **Return Loss Function**: Provide a function to compute loss from the output. - -### State Access - -Megatron Bridge automatically provides the {py:class}`bridge.training.state.GlobalState` object containing: -- **Configuration**: Complete training configuration (`global_state.cfg`). -- **Timers**: Performance monitoring utilities (`global_state.timers`). -- **Training Progress**: Current step, consumed samples (`global_state.train_state`). 
-- **Loggers**: TensorBoard and WandB loggers for metrics tracking. - -All configuration and state information is accessible through the injected `global_state` object. - -For complete implementation examples, see {py:func}`bridge.training.gpt_step.forward_step`. - -## Loss Calculation and Reduction - -The loss function returned by the forward step can follow different patterns based on your needs: - -### Loss Function Patterns - -1. **Standard Pattern**: Return `(loss, metadata_dict)` - - The loss is automatically averaged across microbatches - - Metadata dict contains named loss components for logging - - Most common pattern for standard training - -2. **Token-aware Pattern**: Return `(loss, num_tokens, metadata_dict)` - - Loss is averaged across both microbatches and tokens - - Useful when you want per-token loss averaging - - Recommended for variable-length sequences - -3. **Inference Pattern**: Return arbitrary data structures - - Used with `collect_non_loss_data=True` and `forward_only=True` - - Suitable for inference, evaluation metrics, or custom data collection - - No automatic loss processing applied - -### Automatic Loss Processing - -The training loop automatically handles: -- **Microbatch Reduction**: Aggregates losses across all microbatches in the global batch. -- **Distributed Reduction**: Performs all-reduce operations across data parallel ranks. -- **Pipeline Coordination**: Only the last pipeline stage computes and reduces losses. -- **Logging Integration**: Automatically logs loss components to TensorBoard/WandB. - -For implementation details, see {py:func}`bridge.training.train.train_step` and {py:func}`bridge.training.losses.masked_token_loss`, as an example. - -## Customization - -### When to Customize - -You can customize the forward step function when you need: - -- **Custom Loss Functions**: Beyond standard language modeling loss (e.g., adding regularization, multi-objective training). 
-- **Multi-task Learning**: Training models on multiple tasks simultaneously with different loss components. -- **Custom Data Processing**: Specialized batch preprocessing for domain-specific data formats. -- **Additional Metrics**: Computing extra evaluation metrics during training. -- **Model-specific Logic**: Special handling for custom model architectures or training procedures. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch -from megatron.core.activations import squared_relu - -from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider -from megatron.bridge.peft.base import PEFT -from megatron.bridge.peft.lora import LoRA -from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common -from megatron.bridge.recipes.utils.finetune_utils import default_peft_config -from megatron.bridge.training.comm_overlap import CommOverlapConfig -from megatron.bridge.training.config import ConfigContainer - - -def nemotron_3_nano_pretrain_config() -> ConfigContainer: - """Return a pre-training config for Nemotron 3 Nano (30B-A3B MoE). 
- - This is a MoE (Mixture of Experts) model with the following default parallelism: - - TP=4, PP=1, EP=8, SP=True - - DeepEP enabled for MoE token dispatch - - Returns: - ConfigContainer: Pre-training configuration for Nemotron 3 Nano. - """ - cfg = _pretrain_common() - - # Model Configuration (MoE) - cfg.model = MambaModelProvider( - # Architecture (Nemotron 3 Nano 30B-A3B) - hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", - num_layers=52, - hidden_size=2688, - mamba_num_heads=64, - kv_channels=128, - mamba_state_dim=128, - ffn_hidden_size=1856, - num_attention_heads=32, - mamba_head_dim=64, - seq_length=8192, - num_query_groups=2, - # MoE - num_moe_experts=128, - moe_ffn_hidden_size=1856, - moe_shared_expert_intermediate_size=3712, - moe_router_topk=6, - moe_router_topk_scaling_factor=2.5, - moe_router_num_groups=1, - moe_router_group_topk=1, - # NemotronH base - mamba_num_groups=8, - make_vocab_size_divisible_by=128, - activation_func=squared_relu, - masked_softmax_fusion=True, - apply_query_key_layer_scaling=False, - persist_layer_norm=True, - attention_softmax_in_fp32=False, - first_last_layers_bf16=True, - is_hybrid_model=True, - moe_aux_loss_coeff=0.0001, - moe_router_score_function="sigmoid", - moe_router_enable_expert_bias=True, - moe_router_load_balancing_type="seq_aux_loss", - moe_router_dtype="fp32", - moe_grouped_gemm=True, - moe_token_dispatcher_type="alltoall", - moe_permute_fusion=True, - moe_shared_expert_overlap=True, - # Parallelism - tensor_model_parallel_size=4, - pipeline_model_parallel_size=1, - pipeline_dtype=torch.bfloat16, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - sequence_parallel=True, - expert_tensor_parallel_size=1, - expert_model_parallel_size=8, - ) - - # Tokenizer (--tokenizer-model) - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Dataset Configuration - cfg.dataset.seq_length = 8192 - cfg.dataset.blend = None # Pass the path to the 
dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)] - cfg.dataset.num_workers = 8 - cfg.dataset.mmap_bin_files = False - - # Parallelism Settings (MoE-specific) - cfg.model.pipeline_model_parallel_layout = None - - # MoE Token Dispatcher Settings - cfg.model.moe_token_dispatcher_type = "flex" - cfg.model.moe_flex_dispatcher_backend = "deepep" - cfg.model.moe_hybridep_num_sms = 16 - - # Training Configuration - cfg.train.train_iters = 39735 - cfg.train.global_batch_size = 3072 - cfg.train.micro_batch_size = 2 - cfg.train.manual_gc = False - cfg.train.manual_gc_interval = 0 - - # Transformer Engine (TE) - cfg.model.transformer_impl = "transformer_engine" - - # CUDA Graph - cfg.model.cuda_graph_impl = "none" - cfg.model.cuda_graph_scope = "full" - cfg.model.cuda_graph_warmup_steps = 3 - - # Kernel Selections - cfg.model.attention_backend = "fused" - cfg.model.moe_router_fusion = False - cfg.model.moe_permute_fusion = True - cfg.model.moe_grouped_gemm = True - cfg.model.cross_entropy_loss_fusion = True - cfg.model.cross_entropy_fusion_impl = "native" - - # Memory Saving (recompute & offloading) - cfg.model.recompute_granularity = None - cfg.model.recompute_modules = None - cfg.model.fine_grained_activation_offloading = False - cfg.model.offload_modules = None - - # ========================================================================= - # FP8 & MXFP8 (Mixed Precision Settings) - # ========================================================================= - # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default - # FP8 settings (disabled by default, uncomment to enable) - # cfg.mixed_precision.fp8_recipe = "tensorwise" - # cfg.mixed_precision.fp8 = None - # cfg.mixed_precision.fp8_param_gather = False - # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False - cfg.model.moe_router_padding_for_fp8 = False - - # Optimizer Precision Settings - 
cfg.optimizer.use_precision_aware_optimizer = False - cfg.optimizer.main_grads_dtype = torch.float32 - cfg.optimizer.main_params_dtype = torch.float32 - cfg.optimizer.exp_avg_dtype = torch.float32 - cfg.optimizer.exp_avg_sq_dtype = torch.float32 - - # Optimizer hyperparameters - cfg.optimizer.lr = 1.6e-3 - cfg.optimizer.weight_decay = 0.1 - cfg.optimizer.min_lr = 1.6e-5 - cfg.scheduler.lr_warmup_iters = 333 - - # Communication Overlap - cfg.comm_overlap = CommOverlapConfig( - tp_comm_bootstrap_backend="nccl", - tp_comm_overlap=True, - ) - cfg.comm_overlap.delay_wgrad_compute = False - cfg.comm_overlap.overlap_moe_expert_parallel_comm = False - cfg.model.moe_shared_expert_overlap = False - - # Checkpoint Configuration - # Paths are set in _pretrain_common by default. Override here if needed: - # cfg.checkpoint.load = "path/to/load" - # cfg.checkpoint.save = "path/to/save" - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_assume_constant_structure = True - cfg.checkpoint.dist_ckpt_strictness = "log_all" - - # DDP Configuration - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.use_distributed_optimizer = True - - # MoE Force Load Balancing - cfg.model.moe_router_force_load_balancing = False - - cfg.model.init_method_std = 0.0173 - cfg.model.apply_rope_fusion = False - cfg.model.use_fused_weighted_squared_relu = True - - return cfg - - -# ============================================================================= -# SFT Config -# ============================================================================= - - -def nemotron_3_nano_sft_config() -> ConfigContainer: - """Return a full SFT config for Nemotron 3 Nano (30B-A3B MoE). - - Default parallelism: TP=1, PP=1, EP=8, SP=False - - Returns: - ConfigContainer with all settings pre-configured for Nemotron 3 Nano SFT. 
- """ - cfg = _sft_common() - - # Model config - Nemotron 3 Nano - cfg.model = MambaModelProvider( - # Architecture (Nemotron 3 Nano 30B-A3B) - hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", - num_layers=52, - hidden_size=2688, - mamba_num_heads=64, - kv_channels=128, - mamba_state_dim=128, - ffn_hidden_size=1856, - num_attention_heads=32, - mamba_head_dim=64, - seq_length=2048, - num_query_groups=2, - # MoE - num_moe_experts=128, - moe_ffn_hidden_size=1856, - moe_shared_expert_intermediate_size=3712, - moe_router_topk=6, - moe_router_topk_scaling_factor=2.5, - moe_router_num_groups=1, - moe_router_group_topk=1, - # NemotronH base - mamba_num_groups=8, - make_vocab_size_divisible_by=128, - activation_func=squared_relu, - masked_softmax_fusion=True, - apply_query_key_layer_scaling=False, - persist_layer_norm=True, - attention_softmax_in_fp32=False, - first_last_layers_bf16=True, - is_hybrid_model=True, - moe_aux_loss_coeff=0.0001, - moe_router_score_function="sigmoid", - moe_router_enable_expert_bias=True, - moe_router_load_balancing_type="seq_aux_loss", - moe_router_dtype="fp32", - moe_grouped_gemm=True, - moe_token_dispatcher_type="alltoall", - moe_permute_fusion=True, - moe_shared_expert_overlap=True, - # Extra config - apply_rope_fusion=False, - attention_backend="fused", - init_method_std=0.0173, - use_fused_weighted_squared_relu=True, - calculate_per_token_loss=True, - # Parallelism - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - pipeline_dtype=torch.bfloat16, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - sequence_parallel=False, - expert_tensor_parallel_size=1, - expert_model_parallel_size=8, - ) - - # Parallelism settings - cfg.model.pipeline_model_parallel_layout = None - - # Sequence length - cfg.model.seq_length = 2048 - - # DeePEP settings - set to True to enable DeePEP (enabled by default for Nemotron) - enable_deepep = True - if enable_deepep: - 
cfg.model.moe_token_dispatcher_type = "flex" - cfg.model.moe_flex_dispatcher_backend = "deepep" - cfg.model.moe_shared_expert_overlap = False - else: - cfg.model.moe_token_dispatcher_type = "alltoall" - cfg.model.moe_flex_dispatcher_backend = None - cfg.model.moe_shared_expert_overlap = True - - cfg.model.moe_hybridep_num_sms = 16 - - # TE (Transformer Engine) - cfg.model.transformer_impl = "transformer_engine" - - # CUDA Graph - cfg.model.cuda_graph_impl = "none" - cfg.model.cuda_graph_scope = "full" - cfg.model.cuda_graph_warmup_steps = 3 - - # Kernel selections - cfg.model.attention_backend = "fused" - cfg.model.moe_router_fusion = False - cfg.model.moe_permute_fusion = True - cfg.model.moe_grouped_gemm = True - cfg.model.cross_entropy_loss_fusion = True - cfg.model.cross_entropy_fusion_impl = "native" - - # Memory saving (recompute & offloading) - cfg.model.recompute_granularity = None - cfg.model.recompute_modules = None - cfg.model.fine_grained_activation_offloading = False - cfg.model.offload_modules = None - - # FP8 & MXFP8 settings - # Note: mixed_precision="bf16_mixed" is set as default - # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default - # cfg.mixed_precision.fp8_recipe = "tensorwise" - # cfg.mixed_precision.fp8 = None - # cfg.mixed_precision.fp8_param_gather = False - # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False - cfg.optimizer.use_precision_aware_optimizer = False - cfg.optimizer.main_grads_dtype = torch.float32 - cfg.optimizer.main_params_dtype = torch.float32 - cfg.optimizer.exp_avg_dtype = torch.float32 - cfg.optimizer.exp_avg_sq_dtype = torch.float32 - cfg.model.moe_router_padding_for_fp8 = False - - # MoE Force Load Balancing - cfg.model.moe_router_force_load_balancing = False - - # Training config overrides - cfg.validation.eval_interval = 500 - - # Dataset config - packed_sequence=True by default (from _sft_common), seq_length=2048 - # _sft_common already sets seq_length=2048 and 
packed_sequence=True - # Adjust pad_seq_to_mult for context parallelism - if cfg.model.context_parallel_size > 1: - cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2 - - # Optimizer overrides - Nemotron uses specific optimizer settings - cfg.optimizer.adam_beta2 = 0.95 - cfg.optimizer.adam_eps = 1e-8 - cfg.optimizer.weight_decay = 0.1 - cfg.scheduler.start_weight_decay = 0.1 - cfg.scheduler.end_weight_decay = 0.1 - cfg.scheduler.lr_decay_style = "cosine" - - # Tokenizer - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Checkpoint config overrides - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.dist_ckpt_strictness = "log_all" - cfg.checkpoint.ckpt_assume_constant_structure = True - # Uncomment below if using a pretrained checkpoint and provide path to the directory containing pretrained model for finetuning - # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" - - # Logger config - cfg.logger.log_interval = 10 - cfg.logger.log_timers_to_tensorboard = False - - # RNG config - Nemotron uses seed 1234 - cfg.rng.seed = 1234 - - # DDP config - cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.grad_reduce_in_fp32 = True - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.use_distributed_optimizer = True - - # Communication overlap settings(default None, can pass CommOverlapConfig for advanced overlap), uncomment to enable - # cfg.comm_overlap = CommOverlapConfig( - # tp_comm_bootstrap_backend="nccl", - # tp_comm_overlap=True, - # ) - # cfg.comm_overlap.delay_wgrad_compute = False - # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False - - return cfg - - -# ============================================================================= -# PEFT Config -# ============================================================================= - - -def nemotron_3_nano_peft_config( - peft_scheme: str | PEFT = "lora", -) 
-> ConfigContainer: - """Return a PEFT config for Nemotron 3 Nano (30B-A3B MoE). - - Default parallelism: TP=1, PP=1, EP=8, SP=False - - Args: - peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance. - - Returns: - ConfigContainer with all settings pre-configured for Nemotron 3 Nano PEFT. - """ - cfg = _peft_common() - - # Model config - PEFT uses same parallelism as SFT - cfg.model = MambaModelProvider( - # Architecture (Nemotron 3 Nano 30B-A3B) - hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME", - num_layers=52, - hidden_size=2688, - mamba_num_heads=64, - kv_channels=128, - mamba_state_dim=128, - ffn_hidden_size=1856, - num_attention_heads=32, - mamba_head_dim=64, - seq_length=2048, - num_query_groups=2, - # MoE - num_moe_experts=128, - moe_ffn_hidden_size=1856, - moe_shared_expert_intermediate_size=3712, - moe_router_topk=6, - moe_router_topk_scaling_factor=2.5, - moe_router_num_groups=1, - moe_router_group_topk=1, - # NemotronH base - mamba_num_groups=8, - make_vocab_size_divisible_by=128, - activation_func=squared_relu, - masked_softmax_fusion=True, - apply_query_key_layer_scaling=False, - persist_layer_norm=True, - attention_softmax_in_fp32=False, - first_last_layers_bf16=True, - is_hybrid_model=True, - moe_aux_loss_coeff=0.0001, - moe_router_score_function="sigmoid", - moe_router_enable_expert_bias=True, - moe_router_load_balancing_type="seq_aux_loss", - moe_router_dtype="fp32", - moe_grouped_gemm=True, - moe_token_dispatcher_type="alltoall", - moe_permute_fusion=True, - moe_shared_expert_overlap=True, - # Extra config - apply_rope_fusion=False, - attention_backend="fused", - init_method_std=0.0173, - use_fused_weighted_squared_relu=True, - calculate_per_token_loss=True, - # Parallelism - tensor_model_parallel_size=1, - pipeline_model_parallel_size=1, - pipeline_dtype=torch.bfloat16, - virtual_pipeline_model_parallel_size=None, - context_parallel_size=1, - sequence_parallel=False, - expert_tensor_parallel_size=1, - 
expert_model_parallel_size=8, - ) - - # Parallelism settings - cfg.model.pipeline_model_parallel_layout = None - - # Sequence length - cfg.model.seq_length = 2048 - - # DeePEP settings - set to True to enable DeePEP (enabled by default for Nemotron) - enable_deepep = True - if enable_deepep: - cfg.model.moe_token_dispatcher_type = "flex" - cfg.model.moe_flex_dispatcher_backend = "deepep" - cfg.model.moe_shared_expert_overlap = False - else: - cfg.model.moe_token_dispatcher_type = "alltoall" - cfg.model.moe_flex_dispatcher_backend = None - cfg.model.moe_shared_expert_overlap = True - - cfg.model.moe_hybridep_num_sms = 16 - - # TE (Transformer Engine) - cfg.model.transformer_impl = "transformer_engine" - - # CUDA Graph - cfg.model.cuda_graph_impl = "none" - cfg.model.cuda_graph_scope = "full" - cfg.model.cuda_graph_warmup_steps = 3 - - # Kernel selections - cfg.model.attention_backend = "fused" - cfg.model.moe_router_fusion = False - cfg.model.moe_permute_fusion = True - cfg.model.moe_grouped_gemm = True - cfg.model.cross_entropy_loss_fusion = True - cfg.model.cross_entropy_fusion_impl = "native" - - # Memory saving - cfg.model.recompute_granularity = None - cfg.model.recompute_modules = None - cfg.model.fine_grained_activation_offloading = False - cfg.model.offload_modules = None - - # FP8 & MXFP8 settings - # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default - # cfg.mixed_precision.fp8_recipe = "tensorwise" - # cfg.mixed_precision.fp8 = None - # cfg.mixed_precision.fp8_param_gather = False - # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False - cfg.optimizer.use_precision_aware_optimizer = False - cfg.optimizer.main_grads_dtype = torch.float32 - cfg.optimizer.main_params_dtype = torch.float32 - cfg.optimizer.exp_avg_dtype = torch.float32 - cfg.optimizer.exp_avg_sq_dtype = torch.float32 - cfg.model.moe_router_padding_for_fp8 = False - - # MoE Force Load Balancing - cfg.model.moe_router_force_load_balancing = False - - # 
PEFT config - Nemotron uses Mamba-specific target modules - mamba_target_modules = ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"] - if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]: - cfg.peft = default_peft_config(peft_scheme, target_modules=mamba_target_modules) - elif isinstance(peft_scheme, PEFT): - cfg.peft = peft_scheme - else: - # Default to LoRA with Mamba target modules - cfg.peft = LoRA( - target_modules=mamba_target_modules, - dim=32, - alpha=32, - dropout=0.0, - dropout_position="pre", - lora_A_init_method="xavier", - lora_B_init_method="zero", - ) - - # Training config overrides - cfg.validation.eval_interval = 500 - - # Dataset config - packed_sequence=True by default (from _peft_common), seq_length=2048 - # _peft_common already sets seq_length=2048 and packed_sequence=True - # Adjust pad_seq_to_mult for context parallelism - if cfg.model.context_parallel_size > 1: - cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2 - - # Optimizer overrides - cfg.optimizer.adam_beta2 = 0.95 - cfg.optimizer.adam_eps = 1e-8 - cfg.optimizer.weight_decay = 0.1 - cfg.scheduler.start_weight_decay = 0.1 - cfg.scheduler.end_weight_decay = 0.1 - cfg.scheduler.lr_decay_style = "cosine" - - # Tokenizer - cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" - - # Checkpoint config overrides - cfg.checkpoint.save_interval = 200 - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.dist_ckpt_strictness = "log_all" - cfg.checkpoint.ckpt_assume_constant_structure = True - # Uncomment below if using a pretrained checkpoint and provide path to the directory containing pretrained model for finetuning - # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint" - - # Logger config - cfg.logger.log_interval = 10 - cfg.logger.log_timers_to_tensorboard = False - - # RNG config - Nemotron uses seed 1234 - cfg.rng.seed = 1234 - - # DDP config - 
cfg.ddp.check_for_nan_in_grad = True - cfg.ddp.grad_reduce_in_fp32 = True - cfg.ddp.overlap_grad_reduce = True - cfg.ddp.overlap_param_gather = True - cfg.ddp.use_distributed_optimizer = True - - # Communication overlap settings(default None, can pass CommOverlapConfig for advanced overlap), uncomment to enable - # cfg.comm_overlap = CommOverlapConfig( - # tp_comm_bootstrap_backend="nccl", - # tp_comm_overlap=True, - # ) - # cfg.comm_overlap.delay_wgrad_compute = False - # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False - - return cfg - - -__all__ = [ - # Pretrain config - "nemotron_3_nano_pretrain_config", - # SFT config - "nemotron_3_nano_sft_config", - # PEFT config - "nemotron_3_nano_peft_config", -] - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/performant_lora.png -```png -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/tp_comm_overlap.png -```png -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/callbacks.md -```md -# Callbacks - -Megatron Bridge provides a lightweight callback system for injecting custom logic into the training and evaluation loop without modifying framework code. This is ideal for propietary integrations or custom logging and metrics tracking. 
- -## Quick Start - -### Class-Based Callbacks - -Subclass {py:class}`bridge.training.callbacks.Callback` and override event methods: - -```python -import time - -from megatron.bridge.training.callbacks import Callback -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config - -class MyCallback(Callback): - def on_train_start(self, context): - context.user_state['start_time'] = time.time() - print(f"Training started at step {context.state.train_state.step}") - - def on_train_step_end(self, context): - if context.loss_dict: - print(f"Step {context.state.train_state.step}: loss={context.loss_dict}") - - def on_train_end(self, context): - elapsed = time.time() - context.user_state['start_time'] - print(f"Training completed in {elapsed:.2f}s") - -# Create a config that fits on a single GPU -config = qwen25_500m_pretrain_config() - -# Pass callbacks to pretrain -pretrain(config, forward_step, callbacks=[MyCallback()]) -``` - -### Functional Callbacks - -Register functions directly with {py:class}`bridge.training.callbacks.CallbackManager`: - -```python -from megatron.bridge.training.callbacks import CallbackManager -from megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config - -def log_step(context): - step = context.state.train_state.step - if context.loss_dict: - print(f"Step {step}: {context.loss_dict}") - -callback_manager = CallbackManager() -callback_manager.register("on_train_step_end", log_step) - -# Create a config that fits on a single GPU -config = qwen25_500m_pretrain_config() - -pretrain(config, forward_step, callbacks=callback_manager) -``` - -### Mixing Both Patterns - -Both registration patterns can be combined: - -```python -from megatron.bridge.training.callbacks import CallbackManager -from 
megatron.bridge.training.gpt_step import forward_step -from megatron.bridge.training.pretrain import pretrain -from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config - -manager = CallbackManager() -manager.add(MyCallback()) -manager.add([TimingCallback(), MetricsCallback()]) -manager.register("on_eval_end", lambda ctx: print("Evaluation complete!")) - -# Create a config that fits on a single GPU -config = qwen25_500m_pretrain_config() - -pretrain(config, forward_step, callbacks=manager) -``` - -## Available Events - -### Training Events - -| Event | When Fired | Available Context Fields | -|-------|------------|-------------------------| -| `on_train_start` | After `model.train()`, before training loop | `state`, `model`, `user_state`, `optimizer`, `scheduler` | -| `on_train_step_start` | Before each training step | `state`, `model`, `user_state`, `optimizer`, `scheduler` | -| `on_train_step_end` | After each training step | `state`, `model`, `user_state`, `optimizer`, `scheduler`, `loss_dict`, `grad_norm`, `skipped_iter` | -| `on_train_end` | After training loop completes | `state`, `model`, `user_state`, `optimizer`, `scheduler` | - -### Validation Events - -| Event | When Fired | Available Context Fields | -|-------|------------|-------------------------| -| `on_eval_start` | After `model.eval()`, before validation loop | `state`, `model`, `user_state` | -| `on_eval_step_start` | Before each validation step | `state`, `model`, `user_state` | -| `on_eval_step_end` | After each validation step | `state`, `model`, `user_state` | -| `on_eval_end` | After validation completes | `state`, `model`, `user_state`, `total_loss_dict` | - -### Test Events - -| Event | When Fired | Available Context Fields | -|-------|------------|-------------------------| -| `on_test_start` | After `model.eval()`, before test loop | `state`, `model`, `user_state` | -| `on_test_step_start` | Before each test step | `state`, `model`, `user_state` | -| `on_test_step_end` | After 
each test step | `state`, `model`, `user_state` | -| `on_test_end` | After test completes | `state`, `model`, `user_state`, `total_loss_dict` | - -### Checkpoint Events -| Event | When Fired | Available Context Fields | -|-------|------------|-------------------------| -| `on_checkpoint_save` | When checkpoint was saved| `state`, `model`, `user_state`, `optimizer` | - - -## CallbackContext - -The {py:class}`bridge.training.callbacks.CallbackContext` provides access to framework state: - -### Always Available - -- **`state`**: {py:class}`bridge.training.state.GlobalState` - Contains config, train_state, timers, and loggers -- **`model`**: List of model chunks -- **`user_state`**: Mutable dict for storing data across callback invocations - -### Training Events Only - -- **`optimizer`**: The optimizer instance -- **`scheduler`**: Learning rate scheduler - -### Event-Specific Fields - -- **`loss_dict`** (`on_train_step_end`): Dictionary of reduced losses from the training step -- **`grad_norm`** (`on_train_step_end`): Gradient norm (if computed) -- **`skipped_iter`** (`on_train_step_end`): Whether the iteration was skipped -- **`total_loss_dict`** (`on_eval_end`, `on_test_end`): Aggregated evaluation/test losses - -## User State - -The `CallbackManager` owns a `user_state` dictionary that persists across all callback invocations during a training run. Use it to share data between callbacks or accumulate metrics: - -```python -class StepCounterCallback(Callback): - def on_train_start(self, context): - context.user_state['callback_step_count'] = 0 - - def on_train_step_end(self, context): - context.user_state['callback_step_count'] += 1 - - def on_train_end(self, context): - print(f"Callback saw {context.user_state['callback_step_count']} steps") -``` - -## Distributed Training - -Callbacks fire on **all ranks** without framework-level synchronization. 
If your callback should only run on specific ranks, add guards: - -```python -import torch.distributed as dist - -class RankZeroCallback(Callback): - def on_train_step_end(self, context): - if dist.get_rank() == 0: - print(f"Step {context.state.train_state.step} complete") -``` - -## Exception Handling - -Exceptions from callbacks propagate to the caller. The framework does not catch or handle callback exceptions. If your callback might fail, wrap it in a try-except: - -```python -def safe_callback(context): - try: - # Your logic here - external_service.log(context.loss_dict) - except Exception as e: - print(f"Callback failed: {e}") - # Don't re-raise to avoid stopping training -``` - -## Execution Order - -Callbacks fire in registration order: - -1. Callbacks added via `add()` fire in the order they were added -2. Callbacks registered via `register()` fire in the order they were registered -3. If both methods are used, the order depends on when each was called - -## Introspection - -Query registered callbacks: - -```python -manager = CallbackManager() -manager.register("on_train_start", my_fn) - -# Check if any callbacks exist for an event -if manager.has_callbacks("on_train_start"): - print("Callbacks registered for on_train_start") - -# List all callbacks for an event -callbacks = manager.list_callbacks("on_train_start") -print(f"Found {len(callbacks)} callbacks") - -# Get all valid event names -print(manager.events) # frozenset of valid event names -``` - -## Design Principles - -The callback system follows these principles: - -1. **First-Party Isolation**: Framework code never uses callbacks for its own logic. Callbacks are strictly for third-party extensions. - -2. **Zero Overhead**: When no callbacks are registered, there is zero performance overhead. - -3. **Safety**: Callbacks receive framework state but modifying it is at the user's own risk. The framework makes no guarantees about the effects of modifications. 
- -## Examples - -### Proprietary Metrics - -```python -class ProprietaryMetricsCallback(Callback): - """Send metrics to internal monitoring system.""" - - def __init__(self, endpoint: str): - self.client = InternalMetricsClient(endpoint) - - def on_train_step_end(self, context): - if context.loss_dict: - self.client.send({ - "step": context.state.train_state.step, - "loss": context.loss_dict.get("lm loss"), - "grad_norm": context.grad_norm, - "cluster_id": os.environ.get("CLUSTER_ID"), - }) -``` - -## API Reference - -- {py:class}`bridge.training.callbacks.Callback` -- {py:class}`bridge.training.callbacks.CallbackContext` -- {py:class}`bridge.training.callbacks.CallbackManager` -- {py:func}`bridge.training.callbacks.normalize_callbacks` -- {py:func}`bridge.training.callbacks.should_fire` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotron3.md -```md -# Nemotron 3 Nano -[Nemotron 3 Nano](https://huggingface.co/collections/nvidia/nvidia-nemotron-v3) is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. The model employs a hybrid Mixture-of-Experts (MoE) architecture, consisting of 23 Mamba-2 and MoE layers, along with 6 Attention layers. Each MoE layer includes 128 experts plus 1 shared expert, with 5 experts activated per token. The model has 3.5B active parameters and 30B parameters in total. - -NeMo Megatron Bridge supports pretraining, full parameters finetuning, and LoRA finetuning this model. The finetuned model can be converted back to the 🤗 Hugging Face format for downstream evaluation. - -```{important} -Please use the custom container `nvcr.io/nvidia/nemo:25.11.nemotron_3_nano` when working with this model. - -Run all commands from `/opt/Megatron-Bridge` (e.g. 
`docker run -w /opt/Megatron-Bridge ...`) -``` - -```{tip} -We use the following environment variables throughout this page -- `HF_MODEL_ID=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` -- `MEGATRON_MODEL_PATH=/models/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` (feel free to set your own path) -``` - - -## Conversion with 🤗 Hugging Face - -### Import HF → Megatron -To import the HF model to your desired `$MEGATRON_MODEL_PATH`, run the following command. -```bash -python examples/conversion/convert_checkpoints.py import \ ---hf-model $HF_MODEL_ID \ ---megatron-path /path/to/output/megatron/ckpt \ ---trust-remote-code -``` - -### Export Megatron → HF -```bash -python examples/conversion/convert_checkpoints.py export \ ---hf-model $HF_MODEL_ID \ ---megatron-path /path/to/trained/megatron/ckpt \ ---hf-path /path/to/output/hf/ckpt -``` - -## Pretraining Examples -```bash -BLEND_PATH=/path/to/dataset/blend -TOKENIZER_MODEL=/path/to/tiktok/tokenizer/model - -torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_nano.py \ ---per-split-data-args-path=${BLEND_PATH} \ ---tokenizer-model=${TOKENIZER_MODEL} \ -train.global_batch_size=3072 \ -train.train_iters=39500 \ -scheduler.lr_warmup_iters=350 -``` - -Notes: -- The default parallelism settings are TP=4, EP=8, PP=1, CP=1. It is recommended to run this pretraining on 4 H100 nodes (32 GPUs). -- To enable wandb logging, you can append `logger.wandb_project=PROJECT_NAME`, `wandb_entity=ENTITY_NAME`, and `wandb_exp_name=EXP_NAME` arguments -- If `BLEND_PATH` and `TOKENIZER_MODEL` are not specified, mock dataset will be used. - - -## Finetuning Recipes - -### Full Parameter Fine-Tuning -```bash -torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \ -train.global_batch_size=128 \ -train.train_iters=100 \ -scheduler.lr_warmup_iters=10 \ -checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt -``` - -Notes: -- Default parallelism TP=1, EP=8, PP=1, CP=1. 
Running this recipe requires at least 2 H100 nodes (16 GPUs). -- By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used. To use customerized dataset, see this [tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/recipes/llama#quickstart) -- Fine-tuning requires a pretrained megatron checkpoint, which can be obtained in "Import HF → Megatron" section above - - -### LoRA Fine-Tuning -To enable LoRA fine-tuning, pass `--peft lora` to script -```bash -torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \ ---peft lora \ -train.global_batch_size=128 \ -train.train_iters=100 \ -scheduler.lr_warmup_iters=10 \ -checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt -``` - -Notes: -- By default, the target modules are linear layers `["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]` in the model -- The rest of settings are the same as full parameter fine-tuning above. - - -A LoRA checkpoint only contains the learnable adapter weights. In order to convert the LoRA checkpoint to Hugging Face format for downstream evaluation, it is necessary to merge the LoRA adapters back to the base model. 
- -```bash -python examples/peft/merge_lora.py \ ---hf-model-path $HF_MODEL_ID \ ---lora-checkpoint /path/to/lora/ckpt/iter_xxxxxxx ---output /path/to/merged/ckpt -``` -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/index.md -```md -```{include} ../README.md -:relative-docs: docs/ -``` - -```{toctree} -:caption: Guides -:hidden: - -parallelisms.md -performance-summary.md -performance-summary-archive.md -performance-guide.md -recipe-usage.md -nemo2-migration-guide.md -megatron-lm-to-megatron-bridge.md -``` - -```{toctree} -:caption: Bridge with 🤗 Hugging Face -:hidden: - -bridge-guide.md -bridge-tech-details.md -``` - -```{toctree} -:caption: Supported Models -:hidden: - -models/llm/index.md -models/vlm/index.md -``` - -```{toctree} -:caption: Training and Customization -:hidden: - -training/config-container-overview.md -training/entry-points.md -training/training-loop-settings.md -training/optimizer-scheduler.md -training/logging.md -training/profiling.md -training/checkpointing.md -training/megatron-fsdp.md -training/resiliency.md -training/mixed-precision.md -training/cuda-graphs.md -training/hybrid-context-parallel.md -training/communication-overlap.md -training/attention-optimizations.md -training/activation-recomputation.md -training/cpu-offloading.md -training/peft.md -training/packed-sequences.md -training/multi-token-prediction.md -training/distillation.md -training/pruning.md -training/callbacks.md -``` - -```{toctree} -:caption: Model Optimization with ModelOpt -:hidden: - -modelopt/quantization.md -``` - -```{toctree} -:caption: Development -:hidden: - -adding-new-models.md -bridge-rl-integration.md -documentation.md -apidocs/index.rst -``` - -```{toctree} -:caption: Releases -:hidden: - -releases/release-process.md -releases/software-versions.md -releases/changelog.md -releases/known-issues.md -``` - -```{toctree} -:caption: Agent Skills -:hidden: - -skills-index -``` - -```{toctree} -:caption: Directory Readme Files -:hidden: - -README.md 
-models/README.md -models/llm/README.md -models/vlm/README.md -releases/README.md -training/README.md -modelopt/README.md -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/__init__.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Nemotron Nano v2 models -# Nemotron 3 Nano models -from megatron.bridge.recipes.nemotronh.nemotron_3_nano import ( - nemotron_3_nano_peft_config, - nemotron_3_nano_pretrain_config, - nemotron_3_nano_sft_config, -) - -# Nemotron 3 Super models -from megatron.bridge.recipes.nemotronh.nemotron_3_super import ( - nemotron_3_super_peft_config, - nemotron_3_super_pretrain_config, - nemotron_3_super_sft_config, -) -from megatron.bridge.recipes.nemotronh.nemotron_nano_v2 import ( - nemotron_nano_9b_v2_peft_config, - nemotron_nano_9b_v2_pretrain_config, - nemotron_nano_9b_v2_sft_config, - nemotron_nano_12b_v2_peft_config, - nemotron_nano_12b_v2_pretrain_config, - nemotron_nano_12b_v2_sft_config, -) - -# NemotronH models -from megatron.bridge.recipes.nemotronh.nemotronh import ( - nemotronh_4b_peft_config, - nemotronh_4b_pretrain_config, - nemotronh_4b_sft_config, - nemotronh_8b_peft_config, - nemotronh_8b_pretrain_config, - nemotronh_8b_sft_config, - nemotronh_47b_peft_config, - nemotronh_47b_pretrain_config, - nemotronh_47b_sft_config, - nemotronh_56b_peft_config, - nemotronh_56b_pretrain_config, - nemotronh_56b_sft_config, -) - - 
-__all__ = [ - # NemotronH models - "nemotronh_4b_pretrain_config", - "nemotronh_8b_pretrain_config", - "nemotronh_47b_pretrain_config", - "nemotronh_56b_pretrain_config", - "nemotronh_4b_sft_config", - "nemotronh_8b_sft_config", - "nemotronh_47b_sft_config", - "nemotronh_56b_sft_config", - "nemotronh_4b_peft_config", - "nemotronh_8b_peft_config", - "nemotronh_47b_peft_config", - "nemotronh_56b_peft_config", - # Nemotron Nano v2 models - "nemotron_nano_9b_v2_pretrain_config", - "nemotron_nano_12b_v2_pretrain_config", - "nemotron_nano_9b_v2_sft_config", - "nemotron_nano_12b_v2_sft_config", - "nemotron_nano_9b_v2_peft_config", - "nemotron_nano_12b_v2_peft_config", - # Nemotron 3 Nano models - "nemotron_3_nano_pretrain_config", - "nemotron_3_nano_sft_config", - "nemotron_3_nano_peft_config", - # Nemotron 3 Super models - "nemotron_3_super_pretrain_config", - "nemotron_3_super_sft_config", - "nemotron_3_super_peft_config", -] - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/index.md -```md -# Large Language Models - -This section documents Large Language Models supported by Megatron Bridge, with examples for converting to/from 🤗 Hugging Face and links to training recipes. - -```{toctree} -:hidden: - -deepseek-v2.md -deepseek-v3.md -gemma2.md -gemma3.md -glm45.md -gpt-oss.md -llama3.md -llama-nemotron.md -mistral.md -moonlight.md -nemotron3.md -nemotron3-super.md -nemotronh.md -olmoe.md -qwen.md -``` -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/activation-recomputation-example-1.jpg -```jpg -[Binary file] -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Dataset configuration utilities for recipes and training scripts.""" - -import logging -from typing import Callable, List, Optional, Tuple - -from megatron.bridge.data.energon.energon_provider import EnergonProvider -from megatron.bridge.data.loaders import get_blend_and_blend_per_split -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider -from megatron.bridge.recipes.utils.finetune_utils import ( - default_gsm8k_config, - default_openmathinstruct2_config, - default_squad_config, -) -from megatron.bridge.training.config import ( - ConfigContainer, - FinetuningDatasetConfig, - GPTDatasetConfig, - MockGPTDatasetConfig, -) - - -logger = logging.getLogger(__name__) - - -_BLEND_TYPE = Optional[Tuple[List[str], Optional[List[float]]]] -_BLEND_PER_SPLIT_TYPE = Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] -_SPLIT_TYPE = Optional[str] - - -def get_blend_fields_from_data_paths( - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - per_split_data_args_path: Optional[str] = None, - mock: bool = False, -) -> Tuple[_BLEND_TYPE, _BLEND_PER_SPLIT_TYPE, _SPLIT_TYPE]: - """ - Common configuration logic for blend, blend_per_split, split dataset config fields. - - Handles mock and real data. If no path to data is provided, mock data will be used. 
- Prioritizes `data_paths` over split data paths. For all of `data_paths`, `train_data_path`, - `valid_data_path`, and `test_data_path`, two formats are accepted: either (1) a list of prefixes, - e.g. ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped - list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] - - Args: - data_paths (Optional[List[str]]): List of paths to dataset files. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - - Returns: - A tuple (blend, blend_per_split, split), the corresponding fields to be passed to GPTDatasetConfig. 
- """ - has_any_data_config = any( - [data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path] - ) - - if mock or not has_any_data_config: - # Mock data configuration - blend = None # Will trigger mock mode automatically - blend_per_split = None # Will trigger mock mode automatically - split = "1,1,1" # Equal splits for testing - else: - # Real data configuration - blend, blend_per_split = get_blend_and_blend_per_split( - data_paths=data_paths, - data_args_path=data_args_path, - train_data_paths=train_data_path, - valid_data_paths=valid_data_path, - test_data_paths=test_data_path, - per_split_data_args_path=per_split_data_args_path, - ) - - if blend_per_split is not None: - # When using blend_per_split, split should be None - split = None - elif blend is not None: - # When using regular blend, we can use split - split = "9999,8,2" - else: - # No data provided, fall back to mock mode - split = "1,1,1" - - return blend, blend_per_split, split - - -# --------------------------------------------------------------------------- -# Unified dataset type registry -# --------------------------------------------------------------------------- - -DATASET_TYPES = [ - "llm-pretrain", - "llm-pretrain-mock", - "llm-finetune", - "llm-finetune-preloaded", - "vlm-energon", - "vlm-hf", - "vlm-preloaded", -] - -LLM_FINETUNE_PRESETS: dict[str, Callable] = { - "squad": default_squad_config, - "openmathinstruct2": default_openmathinstruct2_config, - "gsm8k": default_gsm8k_config, -} - - -def extract_and_remove_override(cli_overrides: list[str], key: str, default: str | None = None) -> str | None: - """Extract a Hydra-style override (key=value) from *cli_overrides* and remove it. - - Returns the value if found, otherwise *default*. 
- """ - prefix = f"{key}=" - for i, override in enumerate(cli_overrides): - if override.startswith(prefix): - value = override[len(prefix) :] - cli_overrides.pop(i) - return value - return default - - -def _resolve_seq_length(config: ConfigContainer, seq_length: int | None) -> int: - """Resolve sequence length: explicit arg > model config > 4096 fallback.""" - if seq_length is not None: - return seq_length - if hasattr(config, "model") and config.model is not None and hasattr(config.model, "seq_length"): - return config.model.seq_length - return 4096 - - -def apply_dataset_override( - config: ConfigContainer, - dataset_type: str, - packed_sequence: bool = False, - seq_length: int | None = None, - cli_overrides: list[str] | None = None, -) -> ConfigContainer: - """Replace the recipe's dataset config based on the requested dataset type. - - Args: - config: The recipe config to modify. - dataset_type: One of :data:`DATASET_TYPES`. - packed_sequence: Whether to enable packed sequences. - seq_length: Explicit sequence length (None = use model's or default 4096). - cli_overrides: Mutable list of Hydra-style CLI overrides. For ``llm-finetune``, - ``dataset.dataset_name`` is extracted and consumed here to select the preset. - - Returns: - The modified ConfigContainer. 
- """ - resolved_seq_length = _resolve_seq_length(config, seq_length) - if cli_overrides is None: - cli_overrides = [] - - if dataset_type == "llm-pretrain": - config.dataset = GPTDatasetConfig( - seq_length=resolved_seq_length, - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - num_dataset_builder_threads=1, - blend=None, - blend_per_split=None, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ) - - elif dataset_type == "llm-pretrain-mock": - config.dataset = MockGPTDatasetConfig( - seq_length=resolved_seq_length, - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - num_dataset_builder_threads=1, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ) - - elif dataset_type == "llm-finetune": - preset_name = extract_and_remove_override(cli_overrides, "dataset.dataset_name", default="squad") - if preset_name not in LLM_FINETUNE_PRESETS: - raise ValueError( - f"Unknown finetune dataset preset: '{preset_name}'. " - f"Choose from: {', '.join(sorted(LLM_FINETUNE_PRESETS.keys()))}" - ) - factory = LLM_FINETUNE_PRESETS[preset_name] - kwargs: dict = {"packed_sequence": packed_sequence, "pad_seq_to_mult": 1} - kwargs["seq_length"] = resolved_seq_length - config.dataset = factory(**kwargs) - - elif dataset_type == "llm-finetune-preloaded": - config.dataset = FinetuningDatasetConfig( - seq_length=resolved_seq_length, - dataset_root=None, - dataloader_type="batch", - seed=5678, - ) - - elif dataset_type == "vlm-energon": - if isinstance(config.dataset, EnergonProvider): - logger.info("Recipe already provides EnergonProvider; keeping it (preserves task_encoder).") - else: - logger.warning( - "Creating bare EnergonProvider. task_encoder and image_processor are unset; " - "use a recipe that provides them or set via code." 
- ) - config.dataset = EnergonProvider( - path="", - seq_length=resolved_seq_length, - micro_batch_size=config.train.micro_batch_size, - global_batch_size=config.train.global_batch_size, - num_workers=2, - ) - - elif dataset_type == "vlm-hf": - config.dataset = HFDatasetConversationProvider( - seq_length=resolved_seq_length, - hf_processor_path=None, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=False, - ) - - elif dataset_type == "vlm-preloaded": - config.dataset = PreloadedVLMConversationProvider( - seq_length=resolved_seq_length, - hf_processor_path=None, - train_data_path=None, - valid_data_path=None, - test_data_path=None, - dataloader_type="single", - num_workers=2, - ) - - else: - raise ValueError(f"Unknown dataset type: '{dataset_type}'. Choose from: {', '.join(DATASET_TYPES)}") - - if seq_length is not None and hasattr(config, "model") and config.model is not None: - config.model.seq_length = seq_length - - return config - - -def infer_mode_from_dataset(dataset_type: str) -> str: - """Infer training mode from the dataset type prefix.""" - if dataset_type.startswith("llm-pretrain"): - return "pretrain" - return "finetune" - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/__init__.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Megatron Bridge Recipe Configurations - -This module exposes all recipe configurations from all model families. -""" - -from megatron.bridge.diffusion.recipes.flux.flux import * -from megatron.bridge.diffusion.recipes.wan.wan import * -from megatron.bridge.recipes.deepseek import * -from megatron.bridge.recipes.gemma import * -from megatron.bridge.recipes.gemma3_vl import * -from megatron.bridge.recipes.glm import * -from megatron.bridge.recipes.glm_vl import * -from megatron.bridge.recipes.gpt import * -from megatron.bridge.recipes.gpt_oss import * -from megatron.bridge.recipes.kimi_vl import * -from megatron.bridge.recipes.llama import * -from megatron.bridge.recipes.ministral3 import * -from megatron.bridge.recipes.moonlight import * -from megatron.bridge.recipes.nemotronh import * -from megatron.bridge.recipes.olmoe import * -from megatron.bridge.recipes.qwen import * -from megatron.bridge.recipes.qwen2_audio import * -from megatron.bridge.recipes.qwen_vl import * - -``` -
- - -Write a comprehensive tutorial about pretraining in Megatron-Bridge. Cover the end-to-end workflow (config recipes, dataset wiring, launch methods, core entry points, scaling/perf options, and practical examples), grounded in the selected Megatron-Bridge docs and source files. - - -- Documentation layer (`docs/`): conceptual guidance for training configuration, loop controls, optimization, distributed parallelisms, performance tuning, checkpointing, and resiliency. -- Recipe layer (`src/megatron/bridge/recipes`): reusable `ConfigContainer` defaults (`common.py`) and model-specific pretrain configs (Nemotron 3 Nano/Super). -- Launch layer (`scripts/training`): generic recipe runner (`run_recipe.py`) plus NeMo-Run/Slurm launch wrappers. -- Execution layer (`src/megatron/bridge/training`): `pretrain()` orchestration, setup/bootstrap, and GPT forward-step behavior. -- Example layer (`examples/models/*`): concrete pretraining commands and Slurm job templates for Nemotron 3 and GPT-OSS. - - - -Megatron-Bridge/docs/recipe-usage.md: Central guide for recipe-based pretraining, override patterns (Python/YAML/Hydra-style), and launch methods (`torchrun`, NeMo-Run). -Megatron-Bridge/docs/training/*.md: Full training docs set covering config container, entry points, training-loop settings, optimizer/scheduler, logging, checkpointing, mixed precision, callbacks, profiling, communication overlap, FSDP, resiliency, and related tuning topics. -Megatron-Bridge/docs/parallelisms.md: Detailed distributed parallelism reference used by pretraining tutorials (TP/PP/EP/CP/SP and tradeoffs). -Megatron-Bridge/docs/performance-guide.md and performance-summary.md: Practical performance recommendations and quick reference. -Megatron-Bridge/docs/models/llm/nemotron3.md: Nemotron 3 Nano pretraining walkthrough and command examples. -Megatron-Bridge/docs/models/llm/nemotron3-super.md: Nemotron 3 Super pretraining workflow, hardware/parallelism requirements, and command examples. 
-Megatron-Bridge/docs/models/llm/nemotronh.md: Nemotron H/Nano v2 model family context and recipe usage patterns relevant to pretraining narrative. -Megatron-Bridge/scripts/training/run_recipe.py: Generic CLI training entry script; loads recipe, applies dataset override and Hydra-style config overrides, selects step function, dispatches `pretrain`/`finetune`. -Megatron-Bridge/scripts/training/README.md: User-facing launcher usage and common command patterns for pretraining. -Megatron-Bridge/scripts/training/launch_with_nemo_run.py and launch_with_sbatch.sh: Multi-node orchestration patterns and practical launch templates. -Megatron-Bridge/src/megatron/bridge/recipes/common.py: `_pretrain_common()` baseline defaults (train/scheduler/logger/checkpoint/tokenizer/dataset scaffolding). -Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py: `nemotron_3_nano_pretrain_config()` model-specific overrides on top of `_pretrain_common`. -Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py: `nemotron_3_super_pretrain_config()` model-specific overrides and pretrained bridge references. -Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py: dataset-type mapping and `apply_dataset_override()` logic used by generic scripts. -Megatron-Bridge/src/megatron/bridge/training/pretrain.py: top-level pretraining orchestration (`runtime_config_update` → `setup` → `train`/eval/test lifecycle). -Megatron-Bridge/src/megatron/bridge/training/setup.py: initialization path (dist setup, tokenizer/model/optimizer construction, checkpoint load, data iterator setup). -Megatron-Bridge/src/megatron/bridge/training/gpt_step.py: canonical GPT forward-step batch handling and loss wiring for tutorial code-path explanation. -Megatron-Bridge/src/megatron/bridge/training/utils/omegaconf_utils.py: conversion/merge/apply utilities for YAML and CLI overrides used in examples. 
-Megatron-Bridge/examples/models/nemotron_3/*/pretrain_*.py: concrete Python pretraining entry scripts with override handling and `pretrain(config, forward_step_func)` invocation. -Megatron-Bridge/examples/models/nemotron_3/*/slurm_pretrain.sh: production-style Slurm launch templates and common runtime env/override patterns. -Megatron-Bridge/examples/models/gpt_oss/README.md + slurm_pretrain.sh: additional pretraining recipe/launcher example set for a different model family. - - - -- `scripts/training/run_recipe.py` -> `megatron.bridge.recipes.()` -> `ConfigContainer` -> `training.pretrain.pretrain()`. -- `recipes/common.py::_pretrain_common()` provides shared defaults; model recipe functions (e.g., Nemotron 3 Nano/Super) specialize those defaults. -- `dataset_utils.apply_dataset_override()` mutates recipe config according to `--dataset` mode before final override processing. -- `training.pretrain.pretrain()` calls `training.setup.setup()` to build runtime state and then drives training/eval flow. -- `training.setup.setup()` wires tokenizer + model + optimizer + scheduler + checkpoint + dataloaders; then returns objects used by train loop. -- Example scripts (`examples/models/.../pretrain_*.py`) are minimal wrappers around recipe creation + OmegaConf/Hydra override parsing + `pretrain()` call. -- Docs map directly to these layers: recipe usage and training docs explain the same fields/functions exercised by scripts and source. - - - -- Scope-specific: selection is constrained to `Megatron-Bridge` only. -- Tutorial target audience depth is not explicitly specified (beginner vs advanced), so structure should be broadly accessible while preserving advanced sections (distributed/perf/resiliency). 
- - diff --git a/skills/nemotron-customize/context/mbridge-sft-full.txt b/skills/nemotron-customize/context/mbridge-sft-full.txt deleted file mode 100644 index 7560a7b5d..000000000 --- a/skills/nemotron-customize/context/mbridge-sft-full.txt +++ /dev/null @@ -1,8632 +0,0 @@ - -/Users/mromeijn/src/Megatron-Bridge -├── docs -│ ├── training -│ │ ├── images -│ │ ├── README.md * -│ │ └── packed-sequences.md * -│ ├── images -│ ├── modelopt -│ ├── models -│ │ ├── llm -│ │ └── vlm -│ ├── releases -│ ├── bridge-guide.md * -│ ├── parallelisms.md * -│ └── recipe-usage.md * -├── scripts -│ ├── training -│ │ └── run_recipe.py * + -│ └── performance -│ ├── configs -│ │ ├── deepseek -│ │ ├── gpt_oss -│ │ ├── kimi -│ │ ├── llama -│ │ ├── nemotronh -│ │ ├── qwen -│ │ └── qwen_vl -│ └── utils -├── skills -│ ├── mlm-bridge-training -│ │ └── SKILL.md * -│ ├── perf-techniques -│ │ ├── sequence-packing -│ │ │ └── SKILL.md * -│ │ ├── cuda-graphs -│ │ ├── expert-parallel-overlap -│ │ ├── hybrid-context-parallel -│ │ ├── megatron-fsdp -│ │ ├── moe-comm-overlap -│ │ ├── packed-sequences-long-context -│ │ ├── parallelism-strategies -│ │ └── tp-dp-comm-overlap -│ ├── adding-model-support -│ ├── code-style -│ ├── developer-guide -│ ├── multi-node-slurm -│ ├── parity-testing -│ └── resiliency -├── src -│ └── megatron -│ └── bridge -│ ├── data -│ │ ├── builders -│ │ │ └── finetuning_dataset.py * + -│ │ ├── datasets -│ │ │ ├── packed_parquet.py * + -│ │ │ ├── packed_sequence.py * + -│ │ │ └── sft.py * + -│ │ ├── ... -│ ├── recipes -│ │ ├── utils -│ │ │ ├── dataset_utils.py * + -│ │ │ └── finetune_utils.py * + -│ │ ├── common.py * + -│ │ ├── ... -│ ├── training -│ │ ├── config.py * + -│ │ ├── finetune.py * + -│ │ ├── gpt_step.py * + -│ │ ├── ... -│ ├── diffusion -│ │ └── ... -│ ├── inference -│ │ └── ... -│ ├── models -│ │ └── ... 
-│ ├── peft -│ └── utils -├── .github -│ ├── ISSUE_TEMPLATE -│ ├── actions -│ │ └── test-template -│ └── workflows -│ └── config -├── .specstory -├── 3rdparty -│ └── Megatron-LM -│ ├── .github -│ │ ├── ISSUE_TEMPLATE -│ │ ├── actions -│ │ │ └── ... -│ │ ├── scripts -│ │ └── workflows -│ │ └── ... -│ ├── .gitlab -│ │ ├── scripts -│ │ └── stages -│ ├── docker -│ │ ├── common -│ │ └── patches -│ ├── docs -│ │ ├── advanced -│ │ ├── api-guide -│ │ │ └── ... -│ │ ├── developer -│ │ ├── discussions -│ │ │ └── ... -│ │ ├── get-started -│ │ ├── images -│ │ │ └── ... -│ │ ├── models -│ │ └── user-guide -│ │ └── ... -│ ├── examples -│ │ ├── academic_paper_scripts -│ │ │ └── ... -│ │ ├── bert -│ │ ├── export -│ │ │ └── ... -│ │ ├── gpt3 -│ │ ├── inference -│ │ │ └── ... -│ │ ├── llama -│ │ ├── mamba -│ │ ├── mimo -│ │ │ └── ... -│ │ ├── mixtral -│ │ ├── multimodal -│ │ │ └── ... -│ │ ├── post_training -│ │ │ └── ... -│ │ ├── rl -│ │ │ └── ... -│ │ └── t5 -│ ├── images -│ ├── megatron -│ │ ├── core -│ │ │ └── ... -│ │ ├── inference -│ │ ├── legacy -│ │ │ └── ... -│ │ ├── post_training -│ │ ├── rl -│ │ │ └── ... -│ │ └── training -│ │ └── ... -│ ├── scripts -│ ├── tasks -│ ├── tests -│ │ ├── functional_tests -│ │ │ └── ... -│ │ ├── test_utils -│ │ │ └── ... -│ │ └── unit_tests -│ │ └── ... -│ └── tools -│ ├── bert_embedding -│ └── checkpoint -├── docker -│ ├── common -│ └── patches -├── examples -│ ├── conversion -│ │ ├── adapter -│ │ └── compare_hf_and_megatron -│ ├── decentralized_pg -│ ├── diffusion -│ │ └── recipes -│ │ ├── flux -│ │ │ └── ... -│ │ └── wan -│ │ └── ... 
-│ ├── distillation -│ │ └── llama -│ │ └── conf -│ ├── evaluation -│ │ └── utils -│ ├── inference -│ │ └── vlm -│ ├── long_context -│ ├── models -│ │ ├── audio_lm -│ │ │ ├── qwen2_audio -│ │ │ └── qwen3_asr -│ │ ├── bailing -│ │ ├── gpt_oss -│ │ ├── minimax_m2 -│ │ ├── nemotron_3 -│ │ │ ├── nano -│ │ │ └── super -│ │ ├── qwen3_next -│ │ │ └── conf -│ │ ├── sarvam -│ │ └── vlm -│ │ ├── gemma3_vl -│ │ ├── glm_45v -│ │ ├── kimi_k25_vl -│ │ ├── ministral3 -│ │ ├── nemotron_vl -│ │ │ └── ... -│ │ ├── qwen25_omni -│ │ ├── qwen35_vl -│ │ ├── qwen3_vl -│ │ └── qwen_vl -│ │ └── ... -│ ├── peft -│ ├── quantization -│ │ └── conf -│ ├── resiliency -│ │ ├── fault_tolerance -│ │ └── straggler_detection -│ └── rl -├── tests -│ ├── functional_tests -│ │ ├── data -│ │ │ ├── energon -│ │ │ └── hf_processors -│ │ ├── diffusion -│ │ │ ├── flux -│ │ │ └── wan -│ │ ├── inference -│ │ ├── launch_scripts -│ │ │ ├── active -│ │ │ └── flaky -│ │ ├── models -│ │ │ ├── qwen3_asr -│ │ │ └── qwen_audio -│ │ └── test_groups -│ │ ├── ckpts -│ │ │ └── ... -│ │ ├── converter -│ │ ├── data -│ │ │ └── ... -│ │ ├── diffusion -│ │ │ └── ... -│ │ ├── models -│ │ │ └── ... -│ │ ├── quantization -│ │ │ └── ... -│ │ ├── recipes -│ │ ├── training -│ │ └── utils -│ └── unit_tests -│ ├── data -│ │ ├── builders -│ │ ├── datasets -│ │ ├── energon -│ │ ├── mimo -│ │ └── vlm_datasets -│ ├── diffusion -│ │ ├── data -│ │ │ └── ... -│ │ ├── model -│ │ │ └── ... -│ │ └── recipes -│ │ └── ... -│ ├── inference -│ │ └── vlm -│ ├── models -│ │ ├── common -│ │ ├── decorators -│ │ ├── deepseek -│ │ ├── gemma -│ │ ├── gemma_vl -│ │ ├── glm -│ │ ├── glm_vl -│ │ ├── gpt -│ │ ├── gpt_oss -│ │ ├── hf_pretrained -│ │ ├── kimi -│ │ ├── kimi_vl -│ │ ├── llama -│ │ ├── llama_nemotron -│ │ ├── mamba -│ │ ├── mimo -│ │ ├── minimax_m2 -│ │ ├── ministral3 -│ │ ├── mistral -│ │ ├── nemotron -│ │ ├── nemotron_vl -│ │ ├── nemotronh -│ │ ├── olmoe -│ │ ├── qwen -│ │ ├── qwen3_asr -│ │ │ └── ... 
-│ │ ├── qwen_audio -│ │ ├── qwen_omni -│ │ │ └── ... -│ │ ├── qwen_vl -│ │ │ └── ... -│ │ └── sarvam -│ ├── peft -│ ├── recipes -│ │ ├── gemma -│ │ ├── gpt -│ │ ├── kimi -│ │ ├── nemotronh -│ │ ├── qwen -│ │ ├── qwen_vl -│ │ │ └── ... -│ │ └── utils -│ ├── scripts -│ │ └── performance -│ ├── training -│ │ ├── mimo -│ │ ├── mlm_compat -│ │ ├── post_training -│ │ └── utils -│ └── utils -└── tutorials - ├── data - │ └── dclm - ├── recipes - │ └── llama - │ └── conf - └── training - - -(* denotes selected files) -(+ denotes code-map available) -Config: directory-only view; depth cap 3; selected files shown. - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/tokenizers/megatron_tokenizer.py -Imports: - - import importlib - - import json - - import logging - - import os - - from collections import OrderedDict - - from typing import Optional, Union - - from megatron.core.tokenizers.base_tokenizer import MegatronTokenizerBase ---- -Classes: - - MegatronTokenizer - Methods: - - L40: def __init__(self) -> None: - - L46: def from_pretrained( - tokenizer_path: str = None, metadata_path: Optional[Union[str, dict]] = None, **kwargs - ) -> MegatronTokenizerBase: - - L104: def write_metadata( - tokenizer_path: str, - tokenizer_library: str, - model_type: Optional[str] = None, - tokenizer_class: Optional[MegatronTokenizerBase] = None, - chat_template: Optional[str] = None, - overwrite: Optional[bool] = False, - metadata_path: Optional[str] = None, - ) -> None: - -Functions: - - L170: def _get_metadata_path(tokenizer_path: str) -> str: - - L188: def _get_tokenizer_model_class(library: str, metadata: dict) -> MegatronTokenizerBase: - -Global vars: - - TOKENIZER_MAPPING_NAMES - - TEXT_LIBRARIES - - VISION_LIBRARIES - - logger ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/callbacks.py -Imports: - - import logging - - from collections.abc import Callable - - from dataclasses import dataclass, field - - from typing import 
TYPE_CHECKING - - import torch - - from megatron.core.optimizer import MegatronOptimizer - - from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler - - from megatron.core.transformer import MegatronModule - - from megatron.bridge.training.state import GlobalState ---- -Classes: - - CallbackContext - Properties: - - state - - model - - user_state - - optimizer - - scheduler - - loss_dict - - grad_norm - - skipped_iter - - total_loss_dict - - Callback - Methods: - - L145: def on_data_init_start(self, context: CallbackContext) -> None: - - L154: def on_train_start(self, context: CallbackContext) -> None: - - L158: def on_train_step_start(self, context: CallbackContext) -> None: - - L162: def on_train_step_end(self, context: CallbackContext) -> None: - - L166: def on_train_end(self, context: CallbackContext) -> None: - - L170: def on_eval_start(self, context: CallbackContext) -> None: - - L174: def on_eval_step_start(self, context: CallbackContext) -> None: - - L178: def on_eval_step_end(self, context: CallbackContext) -> None: - - L182: def on_eval_end(self, context: CallbackContext) -> None: - - L186: def on_test_start(self, context: CallbackContext) -> None: - - L190: def on_test_step_start(self, context: CallbackContext) -> None: - - L194: def on_test_step_end(self, context: CallbackContext) -> None: - - L198: def on_test_end(self, context: CallbackContext) -> None: - - L202: def on_checkpoint_save(self, context: CallbackContext) -> None: - - CallbackManager - Methods: - - L237: def __init__(self) -> None: - - L244: def user_state(self) -> dict: - - L248: def add(self, callback: Callback | list[Callback]) -> None: - - L273: def register(self, event_name: str, fn: Callable[[CallbackContext], None]) -> None: - - L308: def events(self) -> frozenset[str]: - - L312: def list_callbacks(self, event_name: str) -> list[Callable[[CallbackContext], None]]: - - L328: def has_callbacks(self, event_name: str) -> bool: - - L339: def fire(self, event_name: 
str, context: CallbackContext) -> None: - -Functions: - - L352: def normalize_callbacks( - callbacks: list[Callback] | CallbackManager | None, -) -> CallbackManager | None: - - L376: def should_fire(callback_manager: CallbackManager | None, event_name: str) -> bool: - -Global vars: - - logger - - VALID_EVENTS ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/state.py -Imports: - - import json - - import os - - import time - - import types - - from dataclasses import dataclass - - from typing import Any, Optional - - import torch - - from megatron.core.energy_monitor import EnergyMonitor - - from megatron.core.timers import Timers - - from megatron.core.utils import StragglerDetector - - from torch.distributed.checkpoint.stateful import Stateful - - from torch.utils.tensorboard.writer import SummaryWriter - - from megatron.core.dist_checkpointing.strategies.torch import get_async_strategy - - from megatron.bridge.training.config import ConfigContainer - - from megatron.bridge.training.nvrx_straggler import NVRxStragglerDetectionManager - - from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer - - from megatron.bridge.training.utils.log_utils import safe_serialize - - from megatron.bridge.training.utils.sig_utils import DistributedSignalHandler - - from megatron.bridge.utils.common_utils import get_rank_safe, get_world_size_safe - - import wandb - - import mlflow - - import comet_ml - - import warnings ---- -Classes: - - TrainState - Methods: - - L63: def state_dict(self) -> dict[str, torch.Tensor]: - - L85: def load_state_dict(self, state_dict: dict[str, torch.Tensor]) -> None: - Properties: - - step - - consumed_train_samples - - skipped_train_samples - - consumed_valid_samples - - floating_point_operations_so_far - - do_train - - do_valid - - do_test - - FaultToleranceState - Properties: - - ft_state_path - - is_persistent_chkpt_loaded - - is_async_chkpt_enabled - - is_calculating_timeouts - - 
is_setup_section_open - - seen_checkpoints_cnt - - seen_tr_iters_cnt - - curr_eval_iter_idx - - GlobalState - Methods: - - L124: def __init__(self) -> None: - - L151: def cfg(self) -> Optional[ConfigContainer]: - - L156: def cfg(self, value: Optional[ConfigContainer]) -> None: - - L172: def tokenizer(self) -> Any: - - L179: def tensorboard_logger(self) -> Optional[SummaryWriter]: - - L195: def wandb_logger(self) -> Optional[Any]: - - L228: def mlflow_logger(self) -> Optional[Any]: - - L255: def _flatten_dict(d: dict[str, Any], parent_key: str = "", sep: str = ".") -> dict[str, Any]: - - L294: def comet_logger(self) -> Optional[Any]: - - L342: def timers(self) -> Timers: - - L352: def train_state(self) -> TrainState: - - L359: def train_state(self, value: TrainState) -> None: - - L368: def fault_tolerance_state(self) -> FaultToleranceState: - - L375: def fault_tolerance_state(self, value: FaultToleranceState) -> None: - - L384: def signal_handler(self) -> DistributedSignalHandler: - - L391: def straggler_timer(self) -> StragglerDetector: - - L397: def initialize_async_checkpoint_worker(self) -> None: - - L425: def async_calls_queue(self) -> Optional[Any]: - - L430: def nvrx_straggler_manager(self) -> Optional[NVRxStragglerDetectionManager]: - - L443: def energy_monitor(self) -> Optional[EnergyMonitor]: - - L455: def _set_signal_handler(self) -> None: - - L460: def reset_for_restart(self) -> None: - -Functions: - - L480: def _timers_write_to_wandb( - self: Timers, - names: list[str], - writer: Any, - iteration: int, - normalizer: float = 1.0, - reset: bool = True, - barrier: bool = False, -) -> None: - - L501: def _timers_write_to_mlflow( - self: Timers, - names: list[str], - logger: Any, - iteration: int, - normalizer: float = 1.0, - reset: bool = True, - barrier: bool = False, -) -> None: - - L527: def _timers_write_to_comet( - self: Timers, - names: list[str], - logger: Any, - iteration: int, - normalizer: float = 1.0, - reset: bool = True, - barrier: bool = 
False, -) -> None: ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/builders/hf_dataset.py -Imports: - - import glob - - import json - - import logging - - import os - - import shutil - - from dataclasses import dataclass - - from pathlib import Path - - from typing import Any, Callable, Optional, Protocol, TypedDict, Union, cast - - from datasets import Dataset, DatasetDict, load_dataset - - from tqdm import tqdm - - from megatron.bridge.data.builders.finetuning_dataset import FinetuningDatasetBuilder - - from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs - - from megatron.bridge.data.datasets.sft import get_dataset_root - - from megatron.bridge.training.config import FinetuningDatasetConfig - - from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer - - from megatron.bridge.utils.common_utils import print_rank_0 ---- -Classes: - - ProcessExampleOutput - Properties: - - input - - output - - original_answers - - ProcessExampleFn - Methods: - - L58: def __call__(self, example: dict[str, Any], tokenizer: MegatronTokenizer | None = None) -> dict[str, Any]: - - HFDatasetConfig - Properties: - - dataset_name - - process_example_fn - - dataset_subset - - dataset_dict - - split - - download_mode - - val_proportion - - split_val_from_train - - delete_raw - - rewrite - - hf_kwargs - - hf_filter_lambda - - hf_filter_lambda_kwargs - - HFDatasetBuilder - Methods: - - L227: def __init__( - self, - dataset_name: str, - tokenizer, - process_example_fn: ProcessExampleFn, - dataset_dict: Optional[DatasetDict] = None, - dataset_subset: Optional[str] = None, - dataset_root: Optional[Union[str, Path]] = None, - split=None, - seq_length=1024, - seed: int = 1234, - memmap_workers: int = 1, - max_train_samples: Optional[int] = None, - packed_sequence_specs: Optional[PackedSequenceSpecs] = None, - download_mode: Optional[str] = None, - val_proportion: Optional[float] = 0.05, - split_val_from_train: bool = True, - 
rewrite: bool = True, - delete_raw: bool = False, - hf_kwargs: Optional[dict[str, Any]] = None, - dataset_kwargs: Optional[dict[str, Any]] = None, - hf_filter_lambda: Optional[Callable] = None, - hf_filter_lambda_kwargs: Optional[dict[str, Any]] = None, - do_validation: bool = True, - do_test: bool = True, - ) -> None: - - L318: def prepare_data(self) -> None: - - L347: def _load_dataset(self) -> DatasetDict: - -Functions: - - L101: def preprocess_and_split_data( - dset: DatasetDict, - dataset_name: str, - dataset_root: Path, - tokenizer: MegatronTokenizer, - process_example_fn: ProcessExampleFn, - split_val_from_train: bool = True, - val_proportion: Optional[float] = None, - train_aliases: tuple[str] = ("train", "training"), - test_aliases: tuple[str] = ("test", "testing"), - val_aliases: tuple[str] = ("val", "validation", "valid", "eval"), - delete_raw: bool = False, - seed: int = 1234, - rewrite: bool = False, - do_test: bool = True, - do_validation: bool = True, -): - -Global vars: - - logger ---- - - -File: /Users/mromeijn/src/Nemotron/src/nemotron/kit/megatron_stub.py -Imports: - - from dataclasses import dataclass, field - - from pathlib import Path ---- -Classes: - - DataConfig - Properties: - - data_path - - mock - - seq_length - - micro_batch_size - - global_batch_size - - ModelConfig - Properties: - - name - - num_layers - - hidden_size - - num_attention_heads - - ffn_hidden_size - - vocab_size - - OptimizerConfig - Properties: - - lr - - min_lr - - weight_decay - - adam_beta1 - - adam_beta2 - - TrainingConfig - Properties: - - max_steps - - log_interval - - eval_interval - - save_interval - - fp16 - - bf16 - - CheckpointConfig - Properties: - - dir - - save_on_train_end - - resume_from - - ConfigContainer - Properties: - - data - - model - - optimizer - - training - - checkpoint ---- - - -File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/legacy/model/gpt_model.py -Imports: - - import torch - - from typing import Optional - - from 
megatron.training import get_args - - from megatron.core import tensor_parallel - - from megatron.core.utils import deprecate_inference_params - - from .enums import AttnMaskType - - from .language_model import parallel_lm_logits - - from .language_model import get_language_model - - from .module import MegatronModule ---- -Classes: - - GPTModel - Methods: - - L48: def __init__(self, - config, - num_tokentypes=0, - parallel_output=True, - pre_process=True, - post_process=True): - - L74: def set_input_tensor(self, input_tensor): - - L78: def forward(self, input_ids, position_ids, attention_mask, - labels=None, tokentype_ids=None, inference_context=None, *, inference_params=None): - - L98: def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): - - L111: def load_state_dict(self, state_dict, strict=True): - -Functions: - - L18: def post_language_model_processing(lm_output, labels, logit_weights, - parallel_output, - fp16_lm_cross_entropy): ---- - - - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/README.md -```md -# Training and Customization - -This directory contains comprehensive documentation for training and customizing models with Megatron Bridge. Learn how to configure training, optimize performance, and customize training workflows. 
- -## Quick Navigation - -### I want to - -**🚀 Get started with training** -→ Start with [Configuration Container Overview](config-container-overview.md) to understand the training setup - -**⚙️ Configure training parameters** -→ See [Training Loop Settings](training-loop-settings.md) and [Optimizer & Scheduler](optimizer-scheduler.md) - -**📊 Monitor and profile training** -→ Check [Logging](logging.md) and [Profiling](profiling.md) guides - -**💾 Manage checkpoints** -→ Read [Checkpointing](checkpointing.md) for saving and resuming training - -**⚡ Optimize performance** -→ Explore [Performance Guide](../performance-guide.md) and [Performance Summary](../performance-summary.md) - -**🔧 Customize training** -→ See [PEFT](peft.md), [Distillation](distillation.md), [Entry Points](entry-points.md), and [Callbacks](callbacks.md) - -## Core Training Documentation - -### Configuration and Setup - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Configuration Container Overview](config-container-overview.md)** | Central configuration object for all training settings | First time setting up training | -| **[Entry Points](entry-points.md)** | Training entry points and execution flow | Understanding how training starts | -| **[Training Loop Settings](training-loop-settings.md)** | Training loop parameters and configuration | Configuring batch sizes, iterations, validation | - -### Optimization and Performance - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Optimizer & Scheduler](optimizer-scheduler.md)** | Optimizer and learning rate scheduler configuration | Setting up optimization | -| **[Mixed Precision](mixed-precision.md)** | Mixed precision training for memory efficiency | Reducing memory usage | -| **[Communication Overlap](communication-overlap.md)** | Overlapping communication with computation | Optimizing distributed training | -| **[Hybrid Context Parallel](hybrid-context-parallel.md)** | 
Hierarchical `a2a+p2p` context parallel guidance | Advanced long-sequence scaling | -| **[Attention Optimizations](attention-optimizations.md)** | Optimizing attention mechanisms | Improving training speed | -| **[Activation Recomputation](activation-recomputation.md)** | Gradient checkpointing strategies | Reducing memory footprint | -| **[CPU Offloading](cpu-offloading.md)** | Offloading to CPU for memory management | Working with limited GPU memory | - -### Monitoring and Debugging - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[Logging](logging.md)** | Logging configuration and TensorBoard/WandB integration | Monitoring training progress | -| **[Profiling](profiling.md)** | Performance profiling and analysis | Identifying bottlenecks | -| **[Resiliency](resiliency.md)** | Handling failures and recovery | Building robust training pipelines | - -### Advanced Features - -| Document | Purpose | When to Read | -|----------|---------|--------------| -| **[PEFT](peft.md)** | Parameter-Efficient Fine-Tuning (LoRA, etc.) | Fine-tuning with limited resources | -| **[Packed Sequences](packed-sequences.md)** | Sequence packing for efficiency | Optimizing data loading | -| **[Megatron FSDP](megatron-fsdp.md)** | Stable overview of Megatron FSDP | Choosing an FSDP path | -| **[Distillation](distillation.md)** | Knowledge distillation techniques | Transferring knowledge between models | -| **[Checkpointing](checkpointing.md)** | Checkpoint saving, loading, and resuming | Managing training state | -| **[Callbacks](callbacks.md)** | Inject custom logic into training loop | Custom logging, metrics, third-party integrations | - -## Training Workflow - -A typical training workflow involves: - -1. **Configure Training** - Set up `ConfigContainer` with model, data, and training parameters -2. **Prepare Data** - Configure dataset loading and preprocessing -3. **Set Optimization** - Configure optimizer, scheduler, and mixed precision -4. 
**Enable Monitoring** - Set up logging and profiling -5. **Configure Checkpointing** - Set up checkpoint saving and resuming -6. **Launch Training** - Start training with configured entry points -7. **Monitor Progress** - Track metrics via logging and profiling -8. **Resume if Needed** - Use checkpointing to resume from saved state - -## Related Documentation - -- **[Main Documentation Index](../index.md)** - Return to main documentation -- **[Performance Guide](../performance-guide.md)** - Comprehensive performance optimization guide -- **[Performance Summary](../performance-summary.md)** - Quick performance reference -- **[Recipe Usage](../recipe-usage.md)** - Using training recipes -- **[Parallelisms](../parallelisms.md)** - Understanding distributed training strategies -- **[Bridge Guide](../bridge-guide.md)** - Working with Hugging Face models - -## Common Training Scenarios - -### 🆕 First-Time Training Setup - -1. [Configuration Container Overview](config-container-overview.md) - Understand the configuration system -2. [Entry Points](entry-points.md) - Learn how to start training -3. [Training Loop Settings](training-loop-settings.md) - Configure basic training parameters -4. [Logging](logging.md) - Set up monitoring - -### ⚡ Performance Optimization - -1. [Performance Guide](../performance-guide.md) - Comprehensive optimization strategies -2. [Mixed Precision](mixed-precision.md) - Enable mixed precision training -3. [Communication Overlap](communication-overlap.md) - Optimize distributed training -4. [Activation Recomputation](activation-recomputation.md) - Reduce memory usage -5. [Profiling](profiling.md) - Identify bottlenecks - -### 💾 Production Training - -1. [Checkpointing](checkpointing.md) - Reliable checkpoint management -2. [Resiliency](resiliency.md) - Handle failures gracefully -3. [Logging](logging.md) - Comprehensive monitoring -4. [Profiling](profiling.md) - Performance analysis - -### 🔧 Customization - -1. 
[PEFT](peft.md) - Parameter-efficient fine-tuning -2. [Distillation](distillation.md) - Knowledge distillation -3. [Entry Points](entry-points.md) - Custom training workflows -4. [Callbacks](callbacks.md) - Inject custom logic (third-party integrations) - ---- - -**Ready to start training?** Begin with [Configuration Container Overview](config-container-overview.md) or return to the [main documentation](../README.md). - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md -```md -# Packed Sequences - -Packed sequences are a fine-tuning technique that reduces padding waste by -concatenating multiple examples into one pack while preserving sequence -boundaries for attention. In Megatron Bridge, this is primarily a supervised -fine-tuning and PEFT optimization rather than a general pretraining feature. - -This page is the stable overview for what packed sequences are, when to use -them, and which constraints are durable. For operational setup, code anchors, -and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md). - -## What It Is - -Fine-tuning datasets often contain examples with highly variable lengths. When -those examples are batched conventionally, many tokens in each batch are just -padding. Packed sequences reduce that waste by building longer packs from -multiple examples and carrying boundary metadata into the attention path. - -In Bridge today, there are two distinct packing paths plus long-context -enablement through context parallelism: - -| Path | Use case | Key config | -|---|---|---| -| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` | -| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` | -| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` | - -These are related but they are not the same knob. 
Offline packed SFT and VLM -in-batch packing solve padding waste; long-context training primarily addresses -activation memory and communication tradeoffs at larger sequence lengths. - -## When to Use It - -Packed sequences are a good fit when all of the following are true: - -- you are doing SFT, PEFT, or VLM finetuning (the two packing paths cover these - cases; see the path table above) -- your examples have variable lengths and padding waste is significant -- you can tolerate the micro-batch constraints of packed training - -Packed sequences are usually not the right answer when: - -- you are doing standard Megatron-style pretraining, which already concatenates - documents during sampling -- you want long-context training in general, where context parallelism is often - the main technique -- your model family or recipe explicitly opts out of packed-sequence support - -## Stable Constraints - -The durable constraints for packed sequences in Bridge are: - -- packed SFT requires `micro_batch_size == 1` -- when context parallelism is used, sequence length must satisfy the standard - CP divisibility constraints -- for fine-tuning with CP enabled, per-token loss behavior and reduction - settings matter -- CUDA-graph-friendly packed metadata requires additional padding constraints - -Model-family support is not universal. Some families and recipe paths explicitly -opt out of packed sequences or related packing modes. 
- -## Relationship to Long-Sequence Training - -Packed sequences and long-sequence training are often mentioned together because -both affect sequence layout and memory behavior, but they solve different -problems: - -- packed sequences mainly reduce padding waste in fine-tuning datasets -- long-sequence training mainly addresses activation memory and communication - tradeoffs at larger sequence lengths - -For long-sequence training guidance, see: - -- `docs/performance-guide.md` -- `docs/training/hybrid-context-parallel.md` - -## Practical Caveats - -The most stable caveats to remember are: - -1. Packed-sequence support is recipe- and model-family-specific. -2. Fine-tuning sequence packing should not be assumed to work with every other - training feature. -3. Packed sequences improve efficiency primarily by reducing padding waste, not - by replacing long-context parallelism or memory-planning techniques. - -## Related Docs - -- [docs/training/multi-token-prediction.md](multi-token-prediction.md) -- [docs/performance-guide.md](../performance-guide.md) -- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md) -- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md) -- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml) -- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md) -- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md -```md -# Parallelisms Guide - -Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. 
These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency. - -## Data Parallelism - -Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps. - -### Distributed Data Parallelism - -Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives. - -![Distributed Data Parallelism](images/ddp.gif) -*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.* - -### Distributed Optimizer - -[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs its own gradient shard, the distributed optimizer conducts reduce-scatter of the parameter gradients instead of all-reduce of them. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory need of large-scale LLM training. - -### Enable Data Parallelism - -In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group. 
- -To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig` - -```python -from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig - -optimizer_config = OptimizerConfig( - optimizer="adam", - lr=3e-4, - weight_decay=0.1, - adam_beta1=0.9, - adam_beta2=0.95, - use_distributed_optimizer=True, - clip_grad=1.0, -) -ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True) - -config = ConfigContainer( - ddp=ddp_config, - optimizer=optimizer_config, - # ... other config parameters -) -``` - -For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation. - -## Model Parallelism - -Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need for per-GPU memory. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance. - -### Tensor Parallelism - -Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads. - -![Tensor Parallelism Overview](images/tp1.png) -*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.* - -![Tensor Parallelism Implementation](images/tp2.png) -*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.* - -#### Enable Tensor Parallelism - -To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. 
This parameter determines the number of GPUs among which the model's tensors are partitioned. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with tensor parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Enable TP across 2 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Implement Tensor Parallelism - -Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html). - -### Pipeline Parallelism - -Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially. - -![Pipeline Parallelism](images/pp.gif) -*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.* - -#### Enable Pipeline Parallelism - -To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed. - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer - -# Configure model with pipeline parallelism -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, # Distribute layers across 4 GPUs - # ... other model parameters -) - -config = ConfigContainer( - model=model_config, - # ... 
other config parameters -) -``` - -#### Interleaved Pipeline Parallel Schedule - -To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`: - -```python -model_config = GPTModelProvider( - pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=2, # 2 model chunks per pipeline stage - # ... other model parameters -) -``` - -For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism). - -#### Implement Pipeline Parallelism - -The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html). - -### Expert Parallelism and Mixture of Experts (MoE) - -Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers. - -MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input. 
- -![Expert Parallelism](images/ep.png) -*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.* - -#### Basic MoE Configuration - -To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider: - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure basic MoE model -model_config = GPTModelProvider( - num_moe_experts=8, # Number of experts in the MoE module - moe_router_topk=2, # Number of experts activated per token - moe_ffn_hidden_size=8192, # Hidden size for expert FFN layers - # ... other model parameters -) -``` - -#### Enable Expert Parallelism - -To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size. - -```python -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, # Distribute 8 experts across 4 GPUs (2 experts per GPU) - # ... other model parameters -) -``` - -#### Enable Expert Tensor Parallelism - -To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration: - -```python -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - expert_tensor_parallel_size=2, # Apply tensor parallelism within each expert - # ... other model parameters -) -``` - -#### Advanced MoE Features - -Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures. 
- -##### DeepEP and HybridEP Optimizations - -DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures: - -- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs -- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs - -These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads. - -**Enable DeepEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... other model parameters -) - -# Apply DeepEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") -``` - -**Enable HybridEP:** - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend - -model_config = GPTModelProvider( - num_moe_experts=8, - expert_model_parallel_size=4, - # ... other model parameters -) - -# Apply HybridEP optimization -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep") -``` - -**GPU Architecture Requirements:** - -- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 -- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300 - -The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware. - -##### Token Dropping for Load Balancing - -Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput. 
- -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -model_config = GPTModelProvider( - num_moe_experts=8, - moe_router_topk=2, - moe_token_dispatcher_type="alltoall", # Required for token dropping - moe_router_load_balancing_type="aux_loss", # Required load balancing type - # ... other model parameters -) - -# Apply token dropping with capacity factor -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, # Capacity multiplier per expert - moe_pad_expert_input_to_capacity=True, # Pad inputs to capacity length -) -``` - -**Configuration Parameters:** - -- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing. -- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes. - -**Requirements:** - -- Token dispatcher must be `alltoall` or `alltoall_seq` -- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none` - -**Trade-offs:** - -Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed. 
- -#### Complete MoE Configuration Example - -Here's a complete example showing how to configure an MoE model with advanced optimizations: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer -from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend -from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop - -# Configure MoE model with expert parallelism -model_config = GPTModelProvider( - num_layers=32, - hidden_size=4096, - num_attention_heads=32, - - # MoE configuration - num_moe_experts=8, # 8 experts total - moe_router_topk=2, # Activate 2 experts per token - moe_ffn_hidden_size=8192, # Expert FFN hidden dimension - moe_token_dispatcher_type="alltoall", # Token dispatcher type - moe_router_load_balancing_type="aux_loss", # Load balancing - - # Expert parallelism - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - expert_tensor_parallel_size=2, # Apply TP within each expert - - # ... other model parameters -) - -# Apply DeepEP optimization (for Ampere/Hopper GPUs) -apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep") - -# Apply token dropping for load balancing -apply_moe_token_drop( - model_config, - moe_expert_capacity_factor=1.0, - moe_pad_expert_input_to_capacity=True, -) - -config = ConfigContainer( - model=model_config, - # ... other config parameters -) -``` - -#### Expert Parallelism Implementation - -The Megatron Bridge implementation of EP uses functionality from Megatron Core. Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details. - -## Activation Partitioning - -In LLM training, a large memory space is needed to store the input activations of the network layers. 
Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes. - -### Sequence Parallelism - -Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency. - -![Sequence Parallelism](images/sp.png) -*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.* - -#### Enable Sequence Parallelism - -To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with sequence parallelism -model_config = GPTModelProvider( - tensor_model_parallel_size=2, # Required for sequence parallelism - sequence_parallel=True, # Enable sequence parallelism - # ... other model parameters -) -``` - -#### Implement Sequence Parallelism - -The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py). - -### Context Parallelism - -Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. 
Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers. - -CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences. - -#### Enable Context Parallelism - -To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed. - -```python -from megatron.bridge.models import GPTModelProvider - -# Configure model with context parallelism -model_config = GPTModelProvider( - context_parallel_size=2, # Distribute sequence across 2 GPUs - # ... other model parameters -) -``` - -For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs. - -#### Implement Context Parallelism - -Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes, such as all-gather and reduce-scatter operations that are transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency. 
- -For more detailed technical information and implementation details, visit: -- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html) -- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py) -- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py) - -## Combined Parallelism Example - -Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency: - -```python -from megatron.bridge.models import GPTModelProvider -from megatron.bridge.training.config import ConfigContainer, OptimizerConfig - -# Configure model with multiple parallelism strategies -model_config = GPTModelProvider( - # Model parallelism - tensor_model_parallel_size=2, # 2-way tensor parallelism - pipeline_model_parallel_size=4, # 4-way pipeline parallelism - virtual_pipeline_model_parallel_size=2, # Interleaved pipeline - - # Activation partitioning - sequence_parallel=True, # Enable sequence parallelism (requires TP > 1) - context_parallel_size=2, # 2-way context parallelism - - # Expert parallelism (for MoE models) - num_moe_experts=8, # 8 experts - expert_model_parallel_size=4, # Distribute experts across 4 GPUs - - # ... other model parameters -) - -# Configure distributed optimizer -optimizer_config = OptimizerConfig( - optimizer="adam", - use_distributed_optimizer=True, # Enable distributed optimizer - # ... other optimizer parameters -) - -config = ConfigContainer( - model=model_config, - optimizer=optimizer_config, - # ... 
other config parameters -) -``` - -## Data Parallel Size Calculation - -The data parallel size is automatically calculated based on the total world size and model parallelism settings: - -``` -data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size) -``` - -For example, with 32 GPUs total and the configuration above: -- `tensor_model_parallel_size = 2` -- `pipeline_model_parallel_size = 4` -- `context_parallel_size = 2` -- `data_parallel_size = 32 / (2 × 4 × 2) = 2` - -## Strategy Selection Guide - -Choosing the right combination depends on model size, hardware topology, -and sequence length. - -### Dense Models by Size - -| Model size | GPUs | Recommended starting point | -|---|---|---| -| < 1B | 1-8 | DP only | -| 1-10B | 8-16 | TP=2-4 + DP | -| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP | -| 70-175B | 64-256 | TP=8 + PP=4-8 + DP | -| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP | - -### MoE Models - -MoE models differ fundamentally from dense models: only a fraction of -parameters are active per token, so TP can often stay at 1 or 2. EP is -the primary scaling dimension. - -| Total / active params | Typical layout | -|---|---| -| < 20B | EP only (TP=1, PP=1) | -| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 | -| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 | -| 500B+ | TP=2 + PP=16 + EP=32-64 | - -### By Hardware Topology - -- **Single node with NVLink**: maximize TP within the node (up to TP=8). -- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes. -- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling. 
- -### By Sequence Length - -| Sequence length | Recommendation | -|---|---| -| < 2K | standard TP + PP + DP | -| 2K-8K | add SP (`sequence_parallel=True`) | -| 8K-32K | add CP=2 | -| 32K+ | add CP=4-8, consider hierarchical CP | - -For operational details on configuring combined parallelism, troubleshooting -layouts, and memory estimation, see the -[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md). - -## Configuration Guidelines - -### Memory Optimization -- Use **distributed optimizer** to reduce optimizer state memory -- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory -- Use **context parallelism** for long sequence training -- Consider **pipeline parallelism** for very large models that don't fit on a single GPU - -### Performance Optimization -- **Tensor parallelism** works best within a single node (high bandwidth) -- **Pipeline parallelism** can work across nodes but requires careful batch size tuning -- **Context parallelism** is essential for long context scenarios -- **Expert parallelism** is specific to MoE models and should match the number of experts -- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures - -### Compatibility -- **Sequence parallelism** requires `tensor_model_parallel_size > 1` -- **Expert parallelism** requires MoE models (`num_moe_experts > 0`) -- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs -- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs -- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher -- All parallelism strategies can be combined, but total parallelism must divide evenly into the world size - -## Related Artifacts - -- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification -- **Knowledge card**: 
[skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status - -## Resources - -- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/) -- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/) -- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM) -- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/bridge-guide.md -```md -# Get Started with 🤗 Hugging Face Conversion - -Megatron Bridge provides seamless bidirectional conversion between 🤗 Hugging Face Transformers and Megatron model definitions. This guide covers the main APIs for loading models, checking compatibility, and converting between formats. - -## Design and Goals - -- Single high-level entry point: `AutoBridge` detects HF model architectures and dispatches to the correct bridge. -- Bidirectional conversion: Import HF → Megatron for training; export Megatron → HF for deployment. -- Parallelism-aware: Handles TP/PP/VPP/CP/EP/ETP distributions during conversion. -- Streaming and memory efficiency: per-parameter streaming using safetensors. -- Provider pattern: Configure Megatron-Core `TransformerConfig`-compatible attributes before instantiation via `to_megatron_provider()`. -- Convenience workflows: `import_ckpt` and `export_ckpt` provide one-call HF↔Megatron checkpoint flows. - -See the repository `README.md` for installation, supported models, and project highlights. - -## Loading a 🤗 Hugging Face Model into Megatron - -The easiest way to load a 🤗 Hugging Face model is using `AutoBridge.from_hf_pretrained()`, which automatically detects the model architecture and selects the appropriate bridge for conversion. 
You can then use `AutoBridge.to_megatron_model()` to initialize the Megatron model from the 🤗 Hugging Face configuration and load 🤗 Hugging Face weights at the same time. - -### Accessing Gated 🤗 Hugging Face Models - -Some models in Megatron Bridge require access to gated repositories on Hugging Face. These are models that require explicit permission from the model authors before you can download or use them. - -If you encounter an error like this when trying to use a model: - -``` -OSError: You are trying to access a gated repo. -Make sure to have access to it at https://huggingface.co/{org}/{model} -``` - -Follow these steps to resolve the issue: - -1. **Request access**: Visit the URL provided in the error message and request access to the gated model -2. **Generate a token**: Create a Hugging Face access token by following [this tutorial](https://huggingface.co/docs/hub/en/security-tokens#how-to-manage-user-access-tokens) -3. **Set the environment variable**: Export your token in your environment: - -```bash -export HF_TOKEN="your-access-token" -``` - -### Basic Usage - -```python -from megatron.bridge import AutoBridge - -# Load a supported model automatically -bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B") - -# Create a provider, configure before instantiation, then build the model -provider = bridge.to_megatron_provider() -provider.tensor_model_parallel_size = 1 -provider.pipeline_model_parallel_size = 1 -provider.finalize() -megatron_model = provider.provide_distributed_model(wrap_with_ddp=False) -``` - -### Advanced Loading Options -You can also load models with specific settings such as precision, device placement, or by enabling trust in remote code: - -```python -import torch -from megatron.bridge import AutoBridge - -# Load with specific settings -bridge = AutoBridge.from_hf_pretrained( - "meta-llama/Llama-2-7b-hf", - torch_dtype=torch.float16, - device_map="auto", - trust_remote_code=True, -) - -# Load from local path -bridge = AutoBridge.from_hf_pretrained("/path/to/local/hf_model")
-``` - -### Using Model Providers - -For more control over model configuration, use the provider pattern. The provider lets you configure any `TransformerConfig` attribute: - -```python -from megatron.bridge import AutoBridge - -# Load a supported model automatically -bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B") - -# Get a model provider (lazy loading) -provider = bridge.to_megatron_provider() - -# Configure parallelism (multi-GPU requires torchrun or srun) -provider.tensor_model_parallel_size = 8 -provider.pipeline_model_parallel_size = 2 - -# Configure fusions -provider.bias_activation_fusion = True -provider.bias_dropout_fusion = True - -# Finalize the provider to run validation checks and complete initialization -provider.finalize() - -# Create the model with all configurations applied -model = provider.provide_distributed_model(wrap_with_ddp=False) -``` - -The provider pattern is especially useful when you need to: -- Override default model parameters -- Configure advanced features like MoE, activation recomputation, or mixed precision -- Set up distributed training parameters - -## Check Supported Models - -Before loading a model, you can check if it's supported by Megatron Bridge. - -You can list all supported 🤗 Hugging Face model architectures as follows: - -```python -from megatron.bridge import AutoBridge - -# Get a list of all supported model architectures -supported_models = AutoBridge.list_supported_models() - -print(f"Found {len(supported_models)} supported models:") -for i, model in enumerate(supported_models, 1): - print(f" {i:2d}. 
{model}") -``` - -Alternatively, check if a specific model is supported: - -```python -from megatron.bridge import AutoBridge - -if AutoBridge.can_handle("meta-llama/Llama-3.2-1B"): - print("✅ Model is supported!") - bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B") -else: - print("❌ Model requires a custom bridge implementation") -``` - -## Converting Back to 🤗 Hugging Face - -After training or modifying a Megatron model, you can convert it back to 🤗 Hugging Face format for deployment or sharing. The bridge provides several methods for this conversion depending on your needs. - -To save the complete model including configuration, tokenizer, and weights: - -```python -# Save the complete model (config, tokenizer, weights) -bridge.save_hf_pretrained(megatron_model, "./my-fine-tuned-llama") - -# The saved model can be loaded with 🤗 Hugging Face -from transformers import AutoModelForCausalLM -hf_model = AutoModelForCausalLM.from_pretrained("./my-fine-tuned-llama") -``` - -You can save the model weights (safetensors): - -```python -# Save just the model weights (faster, smaller) -bridge.save_hf_weights(megatron_model, "./model_weights") - -# Save without progress bar (useful in scripts) -bridge.save_hf_weights(megatron_model, "./weights", show_progress=False) -``` - -You can also stream weights without saving to disk during conversion for on-the-fly use in RL frameworks, for example: - -```python -# Stream weights during conversion (memory efficient) -for name, weight in bridge.export_hf_weights(megatron_model): - print(f"Exporting {name}: {weight.shape}") - -for name, weight in bridge.export_hf_weights(megatron_model, cpu=True): - print(f"Exported {name}: {tuple(weight.shape)}") -``` - -## Common Patterns and Best Practices -When working with Megatron Bridge, there are several patterns that will help you use the API effectively and avoid common pitfalls. - -### 1.
Always Use High-Level APIs -Always prefer high-level APIs like `AutoBridge` for automatic model detection. Avoid direct bridge usage unless you know the specific type required: - -```python -# ✅ Preferred: Use AutoBridge for automatic detection -bridge = AutoBridge.from_hf_pretrained("any-supported-model") - -# ❌ Avoid: Direct bridge usage unless you know the specific type -``` - -### 2. Configure Before Creating Models -When using the provider pattern, always configure parallelism and other settings before creating the model. Creating the model first uses default settings that may not be optimal: - -```python -# ✅ Correct: Configure provider before creating model -provider = bridge.to_megatron_provider() -provider.tensor_model_parallel_size = 8 -provider.finalize() -model = provider.provide_distributed_model(wrap_with_ddp=False) - -# ❌ Avoid: Creating model before configuring parallelism -model = bridge.to_megatron_model() # Uses default settings -``` - -### 3. Leverage the Parameter Streaming API -You can stream converted weights from Megatron to HF without saving to disk: - -```python -# ✅ Use streaming for large models -for name, weight in bridge.export_hf_weights(model, cpu=True): - process_weight(name, weight) -``` - -### 4. Use `from_hf_pretrained` for Export Workflows - -When exporting Megatron checkpoints back to 🤗 Hugging Face format, always use `from_hf_pretrained()` instead of `from_hf_config()`. 
The `from_hf_config()` method does not load the tokenizer and other artifacts required for saving a complete 🤗 Hugging Face checkpoint: - -```python -from megatron.bridge import AutoBridge - -# ✅ Correct: Use from_hf_pretrained for export workflows -bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B") -bridge.export_ckpt("./megatron_checkpoints/llama32_1b", "./hf_exports/llama32_1b") - -# ❌ Avoid: from_hf_config lacks artifacts needed for saving -# config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B") -# bridge = AutoBridge.from_hf_config(config) # Missing tokenizer, etc. -# bridge.export_ckpt(...) # Will fail! -``` - -The `from_hf_config()` method is only suitable for architecture exploration and introspection (e.g., inspecting `transformer_config`), not for checkpoint conversion workflows. - -For more examples and advanced usage patterns, see the `examples/conversion/` directory in the repository. - -## Convenience Workflows (Commands) - -These examples can be run directly as shell commands. 
- -### HF → Megatron checkpoint import (one call) - -```bash -huggingface-cli login --token -python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-llama/Llama-3.2-1B','./megatron_checkpoints/llama32_1b')" -``` - -### Megatron → HF export (one call) - -```bash -python -c "from megatron.bridge import AutoBridge; b=AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B'); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')" -``` - -### Create Megatron models and run locally - -```bash -python - << 'PY' -from megatron.bridge import AutoBridge - -bridge = AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B') -provider = bridge.to_megatron_provider() -provider.tensor_model_parallel_size = 1 -provider.pipeline_model_parallel_size = 1 -provider.finalize() -model = provider.provide_distributed_model(wrap_with_ddp=False) - -# Export to HF folder -bridge.save_hf_pretrained(model, './hf_exports/llama32_1b') -PY -``` - -### Launch with multiple GPUs (example) - -```bash -torchrun --nproc-per-node=2 -m examples.conversion.generate_from_hf -``` - -## AutoBridge API Reference - -Latest public APIs and signatures (see {doc}`apidocs/bridge/bridge.models.conversion.auto_bridge`): - -```python -from megatron.bridge import AutoBridge - -# Creation and capability -AutoBridge.from_hf_pretrained(path: str | Path, **kwargs) -> AutoBridge -AutoBridge.from_hf_config(config: PretrainedConfig) -> AutoBridge -AutoBridge.can_handle(path: str | Path, trust_remote_code: bool = False) -> bool -AutoBridge.list_supported_models() -> list[str] -AutoBridge.supports(config: Any) -> bool - -# Provider/model construction -AutoBridge.to_megatron_provider(load_weights: bool = True, hf_path: str | Path | None = None) -> GPTModelProvider -AutoBridge.to_megatron_model(load_weights: bool = True, hf_path: str | Path | None = None, **kwargs) -> list[MegatronModule] - -# HF → Megatron weights -AutoBridge.load_hf_weights(model: list[MegatronModule], hf_path: str 
| Path | None = None) -> None - -# Megatron → HF conversion -AutoBridge.export_hf_weights(model: list[MegatronModule], cpu: bool = False, show_progress: bool = True, conversion_tasks: Optional[list[WeightConversionTask]] = None) -> Iterable[HFWeightTuple] -AutoBridge.save_hf_pretrained(model: list[MegatronModule], path: str | Path, show_progress: bool = True) -> None -AutoBridge.save_hf_weights(model: list[MegatronModule], path: str | Path, show_progress: bool = True) -> None - -# Megatron native checkpoints -AutoBridge.save_megatron_model(model: list[MegatronModule], path: str | Path) -> None -AutoBridge.load_megatron_model(path: str | Path, **kwargs) -> list[MegatronModule] - -# One-call workflows -AutoBridge.import_ckpt(hf_model_id: str | Path, megatron_path: str | Path, **kwargs) -> None # HF → Megatron ckpt -AutoBridge.export_ckpt(megatron_path: str | Path, hf_path: str | Path, show_progress: bool = True) -> None # Megatron → HF - -# Config extraction -AutoBridge.transformer_config -> TransformerConfig -AutoBridge.mla_transformer_config -> MLATransformerConfig - -# Introspection / planning -AutoBridge.get_conversion_tasks(megatron_model: MegatronModule | list[MegatronModule], hf_path: str | Path | None = None) -> list[WeightConversionTask] -``` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/docs/recipe-usage.md -```md -# Using Recipes - -Megatron Bridge provides production-ready training recipes for several popular models. You can find an overview of supported recipes and 🤗 HuggingFace bridges [here](index.md#supported-models). -This guide will cover the next steps to make use of a training recipe, including how to [override configuration](#overriding-configuration) and how to [launch a job](#launch-methods). - -## Overview - -- **Coverage**: We provide recipes across select model families and sizes, including Llama, Qwen, DeepSeek, and Nemotron-H (Mamba-based). 
-- **Defaults**: Each recipe sets defaults meant for convergence and performance across parallelisms, precision data types, and optimizer & scheduler choices. These recipes can be used as a high-quality starting point. -- **Integration**: Recipes return a single `ConfigContainer` that plugs directly into our training [entry points](training/entry-points.md) (see the published docs as well: https://docs.nvidia.com/nemo/megatron-bridge/latest/training/entry-points.html). -- **Customization**: You can override any part of the recipe (Python, YAML, CLI) to adapt to your data, scale, and objectives. - -## Overriding configuration - -Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md). -The benefit of providing the full recipe through a pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse` or something else. In other words, the user may override the recipe however they see fit. - -The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py). - - -### Python - -If you prefer to manage configuration in Python, you can directly modify attributes of the `ConfigContainer`: - -```python -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config - -# Get the base ConfigContainer from the recipe -cfg: ConfigContainer = pretrain_config() - -# Apply overrides. 
Note the hierarchical structure -cfg.train.train_iters = 20 -cfg.train.global_batch_size = 8 -cfg.train.micro_batch_size = 1 -cfg.logger.log_interval = 1 -``` - -You can also replace entire sub-configs of the `ConfigContainer`: - -```python -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.models.llama import Llama3ModelProvider - -cfg: ConfigContainer = pretrain_config() - -small_llama = Llama3ModelProvider( - num_layers=2, - hidden_size=768, - ffn_hidden_size=2688, - num_attention_heads=16, -) -cfg.model = small_llama -``` - -### YAML -Overriding a configuration recipe with a YAML file can be done using OmegaConf utilities: - -```python -from omegaconf import OmegaConf -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, -) - -cfg: ConfigContainer = pretrain_config() -yaml_filepath = "conf/llama3-8b-benchmark-cfg.yaml" - -# Convert the initial Python dataclass to an OmegaConf DictConfig for merging -# excluded_fields holds some configuration that cannot be serialized into a DictConfig -merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - -# Load and merge YAML overrides -yaml_overrides_omega = OmegaConf.load(yaml_filepath) -merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega) - -# Apply overrides while preserving excluded fields -final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) -apply_overrides(cfg, final_overrides_as_dict, excluded_fields) -``` - -The above snippet will update `cfg` with all overrides from `llama3-8b-benchmark-cfg.yaml`. 
- -### Hydra-style - -Megatron Bridge provides some utilities to update the ConfigContainer using Hydra-style CLI overrides: - -```python -import sys -from omegaconf import OmegaConf -from megatron.bridge.recipes.llama.llama3_8b import pretrain_config -from megatron.bridge.training.utils.omegaconf_utils import ( - apply_overrides, - create_omegaconf_dict_config, - parse_hydra_overrides, -) - -cfg: ConfigContainer = pretrain_config() -cli_overrides = sys.argv[1:] - -# Convert the initial Python dataclass to an OmegaConf DictConfig for merging -# excluded_fields holds some configuration that cannot be serialized into a DictConfig -merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg) - -# Parse and merge CLI overrides -merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides) - -# Apply overrides while preserving excluded fields -final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True) -apply_overrides(cfg, final_overrides_as_dict, excluded_fields) -``` - -After the above snippet, `cfg` will be updated with all CLI-provided overrides. -A script containing the above code could be called like so: - -```sh -torchrun pretrain_cli_overrides.py model.tensor_model_parallel_size=4 train.train_iters=100000 ... -``` - -## Launch methods - -Megatron Bridge supports launching scripts with both `torchrun` and [NeMo-Run](https://github.com/NVIDIA-NeMo/Run). -Once your script is ready to be launched, refer to one of the following sections. - -### Torchrun -Megatron Bridge training scripts can be launched with the `torchrun` command that most PyTorch users are familiar with. -Simply specify the number of GPUs to use with `--nproc-per-node` and the number of nodes with `--nnodes`. For example, on a single node: - -```sh -torchrun --nnodes 1 --nproc-per-node 8 /path/to/train/script.py -``` - -For multi-node training, it is recommended to use a cluster orchestration system like SLURM. 
-The `torchrun` command should be wrapped as specified by your cluster orchestration system. -For example, with Slurm, wrap the `torchrun` command inside of `srun`: - -```sh -# launch.sub - -srun --nodes 2 --gpus-per-node 8 \ - --container-image /path/to/container/image --container-mounts /host/path:/container/path \ - bash -c " - torchrun --nnodes $SLURM_NNODES --nproc-per-node $SLURM_GPUS_PER_NODE /path/to/train/script.py - " -``` - -Add any other flags required by your cluster. It is also recommended to use a NeMo Framework container with Slurm. You can find a list of container tags on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags). - -### NeMo-Run - -Megatron Bridge also supports launching training with [NeMo-Run](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html). NeMo-Run is a Python package that enables configuring and executing experiments across several platforms. -For multi-node training, NeMo-Run will generate a script with appropriate commands, similar to the `srun` command described above. - -The recommended method to launch a Megatron Bridge script with NeMo-Run is through the `run.Script` API. -You can modify the following 3 steps to your needs in a new file: - -```python -import nemo_run as run - -if __name__ == "__main__": - # 1) Configure the `run.Script` object - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - - # 2) Define an executor for the desired target platform - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - # 3) Execute - run.run(train_script, executor=executor) -``` - -NeMo-Run supports launching on several different platforms, including [SLURM clusters](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#slurmexecutor).
-For more details, please see the NeMo-Run [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#) for a list of supported platforms, their corresponding executors, and configuration instructions. - -You can also forward arguments from the NeMo-Run launch script to the target script: - -```python -import nemo_run as run -import argparse - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - ... - known_args, args_to_fwd = parser.parse_known_args() - train_script = run.Script(..., args=args_to_fwd) -``` - -For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py). - -#### Plugins - -Megatron Bridge provides several NeMo-Run plugins to simplify the usage of certain features. -These plugins can simply be added to the `run.run()` call: - -```python -import nemo_run as run -from megatron.bridge.recipes.run_plugins import NsysPlugin - -if __name__ == "__main__": - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - plugins = [] # plugins argument expects a list - nsys = NsysPlugin(profile_step_start=10, profile_step_end=15, ...) - plugins.append(nsys) - run.run(train_script, plugins=plugins, executor=executor) -``` - -##### Custom Argument Converters - -By default, plugins convert their configuration to Hydra-style CLI arguments when used with `run.Script` tasks. If your training script uses a different argument format (e.g., argparse), you can provide a custom converter function via the `script_args_converter_fn` parameter. 
- -```python -import nemo_run as run -from typing import List -from megatron.bridge.recipes.run_plugins import ( - PreemptionPlugin, - PreemptionPluginScriptArgs, -) - -# Define a custom converter for argparse-style arguments -def argparse_preemption_converter(args: PreemptionPluginScriptArgs) -> List[str]: - result = [] - if args.enable_exit_handler: - result.append("--enable-exit-handler") - if args.enable_exit_handler_for_data_loader: - result.append("--enable-exit-handler-dataloader") - return result - -if __name__ == "__main__": - train_script = run.Script(path="/path/to/train/script.py", entrypoint="python") - executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun") - - # Use the plugin with the custom converter - plugin = PreemptionPlugin( - preempt_time=120, - enable_exit_handler=True, - script_args_converter_fn=argparse_preemption_converter, - ) - run.run(train_script, plugins=[plugin], executor=executor) -``` - -Each plugin provides its own corresponding dataclass (e.g., `PreemptionPluginScriptArgs`, `NsysPluginScriptArgs`) that defines the available arguments for conversion. - -See the [API reference](#bridge.recipes.run_plugins) for a list of available NeMo-Run plugins. - -### Avoiding Hangs - -When working with any scripts in Megatron Bridge, please make sure you wrap your code in an `if __name__ == "__main__":` -block. Otherwise, your code may hang unexpectedly. - -The reason for this is that Megatron Bridge uses Python's `multiprocessing` module in the backend when running a -multi-GPU job. The multiprocessing module will create new Python processes that will import the current module (your -script). If you do not add the `if __name__ == "__main__":` guard, your module will spawn new processes that import the -module and then each spawn new processes in turn. This results in an infinite loop of process spawning.
- -## Resources - -- [OmegaConf documentation](https://omegaconf.readthedocs.io/en/2.3_branch/) -- [torchrun Documentation](https://docs.pytorch.org/docs/stable/elastic/run.html) -- [PyTorch Multinode Training documentation](https://docs.pytorch.org/tutorials/intermediate/ddp_series_multinode.html) -- [NeMo-Run documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html#) - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/mlm-bridge-training/SKILL.md -```md ---- -name: mlm-bridge-training -description: Run Megatron-LM (MLM) and Megatron Bridge training with mock or real data. Covers correlation testing, available recipes, and multi-GPU examples. Use when running training, comparing MLM vs Bridge, or translating configs. ---- - -# MLM vs Bridge Training - -For how they differ, the arg mapping tables, gotchas, and translation script, see: - -- `docs/megatron-lm-to-megatron-bridge.md` - -## Correlation Testing - -Use `vanilla_gpt_pretrain_config` for loss-correlation testing. This recipe uses -bare `GPTModelProvider` defaults (LayerNorm, GeLU, learned_absolute position -embeddings, `vocab_size` inherited from tokenizer) — matching MLM -`pretrain_gpt.py` defaults with no args. 
- -### MLM Correlation Run (2L/256H, 1 GPU) - -```bash -PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \ -uv run python -m torch.distributed.run --nproc_per_node=1 \ - 3rdparty/Megatron-LM/pretrain_gpt.py \ - --num-layers 2 --hidden-size 256 --num-attention-heads 4 \ - --ffn-hidden-size 1024 --seq-length 512 --max-position-embeddings 512 \ - --micro-batch-size 4 --global-batch-size 32 \ - --train-iters 10 --eval-iters 2 --eval-interval 10 \ - --mock-data --bf16 --use-mcore-models \ - --tokenizer-type NullTokenizer --vocab-size 32000 \ - --lr 3e-4 --min-lr 3e-5 --seed 1234 --log-interval 1 -``` - -### Bridge Correlation Run (same config, 1 GPU) - -```bash -rm -rf nemo_experiments && \ -uv run python -m torch.distributed.run --nproc_per_node=1 \ - scripts/training/run_recipe.py \ - --recipe vanilla_gpt_pretrain_config \ - model.num_layers=2 model.hidden_size=256 \ - model.num_attention_heads=4 model.ffn_hidden_size=1024 \ - model.seq_length=512 dataset.sequence_length=512 \ - train.train_iters=10 train.global_batch_size=32 train.micro_batch_size=4 \ - validation.eval_interval=10 validation.eval_iters=2 \ - optimizer.lr=3e-4 optimizer.min_lr=3e-5 \ - scheduler.lr_warmup_iters=1 scheduler.lr_decay_iters=10 \ - rng.seed=1234 logger.log_interval=1 -``` - -### Verification - -With matched parameters the LM losses should be nearly identical at each -iteration. Compare `lm loss` values from both logs — they should agree to -within BF16 rounding. 
- -## Multi-GPU Examples - -### MLM 2-GPU with TP=2 - -```bash -PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \ -uv run python -m torch.distributed.run --nproc_per_node=2 \ - 3rdparty/Megatron-LM/pretrain_gpt.py \ - --tensor-model-parallel-size 2 --sequence-parallel \ - --num-layers 4 --hidden-size 256 --num-attention-heads 4 \ - --seq-length 1024 --max-position-embeddings 1024 \ - --micro-batch-size 2 --global-batch-size 16 \ - --train-iters 10 --eval-iters 2 --eval-interval 10 \ - --mock-data --bf16 --use-mcore-models \ - --tokenizer-type NullTokenizer --vocab-size 1024 \ - --lr 1e-4 --log-interval 1 -``` - -### Bridge 2-GPU with TP=2 - -```bash -rm -rf nemo_experiments && \ -uv run python -m torch.distributed.run --nproc_per_node=2 \ - scripts/training/run_recipe.py \ - --recipe vanilla_gpt_pretrain_config \ - model.tensor_model_parallel_size=2 model.sequence_parallel=true \ - model.num_layers=4 model.hidden_size=256 \ - model.num_attention_heads=4 model.ffn_hidden_size=1024 \ - model.seq_length=1024 dataset.sequence_length=1024 \ - train.train_iters=10 train.global_batch_size=16 train.micro_batch_size=2 \ - validation.eval_interval=10 validation.eval_iters=2 \ - scheduler.lr_warmup_iters=2 scheduler.lr_decay_iters=10 \ - logger.log_interval=1 -``` - -## Available Recipes - -Common recipes (use with `--recipe`): - -- `vanilla_gpt_pretrain_config` — Minimal GPT (bare GPTModelProvider defaults, - ideal for correlation testing and custom configs) -- `llama32_1b_pretrain_config` — Llama 3.2 1B (16L, 2048H, GBS=512, seq=8192) -- `llama3_8b_pretrain_config` — Llama 3 8B -- `qwen3_8b_pretrain_config` — Qwen3 8B -- `deepseek_v2_lite_pretrain_config` — DeepSeek-V2-Lite 16B MoE - -SFT/PEFT variants use `_sft_config` / `_peft_config` suffix. - -## Megatron-Core Submodule - -For what the submodule is and why two versions exist, see -`docs/megatron-lm-to-megatron-bridge.md`. 
- -### Check current version - -```bash -./scripts/switch_mcore.sh status -``` - -### Switch to dev for testing newer MCore features - -```bash -./scripts/switch_mcore.sh dev - -# uv sync (without --locked) since lockfile is for main -uv sync -``` - -### Switch back to main - -```bash -./scripts/switch_mcore.sh main -``` - -### After pulling latest main - -When you pull the latest Bridge main branch, the submodule pointer may have -been updated. Re-sync the submodule: - -```bash -git submodule update --init 3rdparty/Megatron-LM -``` - -## Pitfalls - -1. **Always `rm -rf nemo_experiments`** before a fresh correlation run. Bridge - auto-resumes from stale checkpoints silently. - -2. **`uv run` required**: Always use `uv run python -m torch.distributed.run` - (not bare `torchrun` or `python`). - -3. **MLM PYTHONPATH**: Must include `3rdparty/Megatron-LM` so `gpt_builders.py` - is importable. - -4. **Scheduler overrides**: When overriding `train.train_iters` to a small - value, also set `scheduler.lr_warmup_iters` and `scheduler.lr_decay_iters` - or you get an assertion error. - -5. **Use `dataset.sequence_length`** in CLI overrides, not `dataset.seq_length`. - -6. **MoE OOM**: Large MoE models require full activation recomputation and - typically multi-node EP. TP does NOT reduce per-GPU expert memory. - -7. **`uv sync --locked` fails after switching to dev**: The lockfile is generated - against the main MCore commit. Use `uv sync` (without `--locked`) when on dev. - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/sequence-packing/SKILL.md -```md ---- -name: sequence-packing -description: Operational guide for enabling packed sequences and long-context config paths in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification. 
---- - -# Sequence Packing Skill - -For stable background and recommendation level, see: - -- `docs/training/packed-sequences.md` -- `card.yaml` (co-located) - -## Enablement - -Offline packed SFT for LLM finetuning: - -```python -from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs - -cfg.train.micro_batch_size = 1 -cfg.dataset.seq_length = 4096 -cfg.model.seq_length = 4096 -cfg.dataset.dataset_kwargs = {"pad_to_max_length": True} -cfg.dataset.packed_sequence_specs = PackedSequenceSpecs( - packed_sequence_size=4096, - pad_seq_to_mult=1, -) -``` - -If CP is enabled: - -```python -cfg.model.context_parallel_size = 2 -cfg.model.calculate_per_token_loss = True -cfg.ddp.average_in_collective = False -cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2 -``` - -If CUDA graphs are enabled for this packed path: - -```python -cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True -cfg.dataset.dataset_kwargs["pad_to_max_length"] = True -``` - -**Note:** `pad_cu_seqlens = True` also requires a metadata JSON file alongside -the packed dataset (asserted in `src/megatron/bridge/data/datasets/sft.py`). -Custom packed datasets that omit the metadata file will hit an assertion at -dataset initialization. 
- -In-batch packing for VLM finetuning: - -```python -cfg.dataset.pack_sequences_in_batch = True -cfg.train.micro_batch_size = 2 -``` - -Long-context baseline: - -```python -cfg.model.seq_length = 16384 -cfg.dataset.seq_length = 16384 -cfg.model.context_parallel_size = 2 -``` - -## Code Anchors - -LLM packed SFT config surface: - -```72:97:src/megatron/bridge/recipes/utils/finetune_utils.py -if packed_sequence: - dataset_kwargs = {"pad_to_max_length": True} - packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult) -else: - dataset_kwargs = {} - packed_sequence_specs = None -``` - -Bridge validation: - -```1617:1657:src/megatron/bridge/training/config.py -if self.model.context_parallel_size > 1: - assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, ... - if isinstance(self.dataset, FinetuningDatasetConfig): - assert self.model.calculate_per_token_loss, ... - assert not self.ddp.average_in_collective, ... -... -if ... packed_sequence_size > 0 and self.train.micro_batch_size > 1: - raise ValueError(...) -... -if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1: - raise ValueError(...) -``` - -VLM in-batch runtime: - -```308:327:src/megatron/bridge/training/vlm_step.py -if enable_packing: - ... - ) = pack_batch_sequences( - ... - pad_token_id=0, - pad_to_multiple_of=cp_size * 2 if cp_size > 1 else 1, - ) -``` - -Packed THD runtime constraint: - -```61:64:src/megatron/bridge/training/gpt_step.py -if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1: - raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)") -``` - -## Pitfalls - -1. Offline packed SFT and VLM in-batch packing are different features with opposite micro-batch rules. -2. When CP is enabled, packed sequence lengths must respect `2 * context_parallel_size` divisibility. -3. 
For finetuning with CP, `calculate_per_token_loss=True` and `ddp.average_in_collective=False` are required. -4. `pad_cu_seqlens=True` also requires `pad_to_max_length=True`. -5. Packing support is model-family-specific. `Qwen3-Next`, `GLM-4.5`, and `Qwen3.5-VL` contain explicit opt-outs in different paths. -6. MTP finetuning is documented as incompatible with packed sequences. - -## Verification - -Use the checked-in unit coverage: - -```bash -uv run python -m pytest tests/unit_tests/training/utils/test_packed_seq_utils.py -v && \ -uv run python -m pytest tests/unit_tests/training/test_config.py -k "packed_sequence or pack_sequences_in_batch or context_parallel_seq_length_divisibility or context_parallel_finetuning_validations" -v && \ -uv run python -m pytest tests/unit_tests/training/test_vlm_step.py -k "enable_packing" -v -``` - -Success criteria: - -- first command reports `8 passed` -- second command reports `14 passed` -- third command reports `2 passed` - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/common.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -from megatron.core.distributed import DistributedDataParallelConfig - -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.peft.lora import LoRA -from megatron.bridge.recipes.utils.finetune_utils import default_squad_config -from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing -from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE -from megatron.bridge.training.config import ( - CheckpointConfig, - ConfigContainer, - DistributedInitConfig, - GPTDatasetConfig, - LoggerConfig, - RNGConfig, - TokenizerConfig, - TrainingConfig, - ValidationConfig, -) - - -def _pretrain_common() -> ConfigContainer: - """Create a base pre-training ConfigContainer with common defaults for any language model. - - This function returns a ConfigContainer template with sensible defaults. - The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` before use. - - Returns: - ConfigContainer: Base configuration template for pre-training. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default optimizer and scheduler - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=3e-4, - min_lr=3e-5, - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - train=TrainingConfig( - train_iters=300000, - global_batch_size=32, - micro_batch_size=2, - manual_gc=True, - manual_gc_interval=100, - manual_gc_eval=100, - ), - validation=ValidationConfig( - eval_interval=500, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - these are the commonly overridden settings - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ), - # Dataset config - uses mock data by default - dataset=GPTDatasetConfig( - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - seq_length=4096, - num_dataset_builder_threads=1, - blend=None, # Mock data mode - blend_per_split=None, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ), - # Logger config - logger=LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - 
tokenizer_model=None, # Must be set by each recipe - ), - # Checkpoint config - checkpoint=CheckpointConfig( - save_interval=500, - save=checkpoint_dir, - load=checkpoint_dir, - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - rng=RNGConfig(seed=1234), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - ) - - return cfg - - -def _sft_common() -> ConfigContainer: - """Create a base SFT (Supervised Fine-Tuning) ConfigContainer with common defaults. - - This function returns a ConfigContainer template with sensible defaults for full SFT - (not LoRA/DoRA). The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` - before use. - - Key differences from pre-training: - - Uses HFDatasetConfig with SQuAD as default dataset - - Lower learning rate (5e-6) suitable for full fine-tuning - - Fewer training iterations (1000) - - Smaller batch sizes - - Supports pretrained_checkpoint loading - - No PEFT (full parameter training) - - Returns: - ConfigContainer: Base configuration template for full SFT. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for SFT - seq_length = 2048 - - # Packed sequence is enabled by default for training efficiency - # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1 - packed_sequence = True - pad_seq_to_mult = 1 # Override in model config if context_parallel_size > 1 - - # Optimizer and scheduler with lower LR for full SFT - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=50, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=5e-6, # Lower LR for full fine-tuning - min_lr=0.0, - adam_beta2=0.98, # Common for fine-tuning - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - shorter training for SFT - train=TrainingConfig( - train_iters=1000, - global_batch_size=128, - micro_batch_size=1, - ), - validation=ValidationConfig( - eval_interval=100, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - minimal settings, model-specific configs can override - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), - # Dataset config - uses SQuAD with packed sequences by default - dataset=default_squad_config( - seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult - ), - # Logger config - logger=LoggerConfig( - log_interval=1, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=None, # Must be set by 
each recipe - ), - # Checkpoint config with pretrained_checkpoint support - checkpoint=CheckpointConfig( - save_interval=100, - save=checkpoint_dir, - load=checkpoint_dir, - pretrained_checkpoint=None, # Set to load from pretrained weights - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - different seed from pretrain - rng=RNGConfig(seed=5678), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - # No PEFT for full SFT - peft=None, - ) - - return cfg - - -def _peft_common() -> ConfigContainer: - """Create a base PEFT (Parameter-Efficient Fine-Tuning) ConfigContainer with LoRA defaults. - - This function returns a ConfigContainer template with sensible defaults for PEFT - using LoRA. The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` - before use. - - Key differences from full SFT: - - Higher learning rate (1e-4) suitable for adapter training - - LoRA enabled by default with standard settings (dim=32, alpha=32) - - Targets all linear layers: linear_qkv, linear_proj, linear_fc1, linear_fc2 - - Returns: - ConfigContainer: Base configuration template for PEFT with LoRA. 
- """ - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for PEFT - seq_length = 2048 - - # Packed sequence is enabled by default for training efficiency - # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1 - packed_sequence = True - pad_seq_to_mult = 1 # Override in model config if context_parallel_size > 1 - - # Optimizer and scheduler with higher LR for PEFT (only training adapters) - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=50, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=1e-4, # Higher LR for adapter training - min_lr=0.0, - adam_beta2=0.98, # Common for fine-tuning - ) - - cfg = ConfigContainer( - # Model - MUST be set by each recipe before use - model=None, # type: ignore[arg-type] - # Training config - shorter training for PEFT - train=TrainingConfig( - train_iters=1000, - global_batch_size=128, - micro_batch_size=1, - ), - validation=ValidationConfig( - eval_interval=100, - eval_iters=32, - ), - # Optimizer and scheduler - optimizer=opt_cfg, - scheduler=scheduler_cfg, - # DDP config - minimal settings for PEFT - ddp=DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - ), - # Dataset config - uses SQuAD with packed sequences by default - dataset=default_squad_config( - seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult - ), - # Logger config - logger=LoggerConfig( - log_interval=1, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ), - # Tokenizer - placeholder, each recipe should set tokenizer_model - tokenizer=TokenizerConfig( - tokenizer_type="HuggingFaceTokenizer", - tokenizer_model=None, # Must be set by 
each recipe - ), - # Checkpoint config with pretrained_checkpoint support - checkpoint=CheckpointConfig( - save_interval=100, - save=checkpoint_dir, - load=checkpoint_dir, - pretrained_checkpoint=None, # Set to load from pretrained weights - ckpt_format="torch_dist", - fully_parallel_save=True, - ), - # RNG config - different seed from pretrain - rng=RNGConfig(seed=5678), - # Distributed init config - dist=DistributedInitConfig(), - comm_overlap=None, - # Mixed precision - bf16 by default - mixed_precision="bf16_mixed", - # LoRA config with standard defaults - peft=LoRA( - target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"], - dim=32, - alpha=32, - dropout=0.0, - dropout_position="pre", - lora_A_init_method="xavier", - lora_B_init_method="zero", - a2a_experimental=False, - lora_dtype=None, # Uses model's dtype - ), - ) - - return cfg - - -def _sft_common_vlm() -> ConfigContainer: - """Create a base SFT ConfigContainer with common defaults for Vision-Language Models. - - This function inherits from `_sft_common()` and overrides VLM-specific settings. - The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. - - Key differences from LLM SFT (`_sft_common`): - - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) - - Uses NullTokenizer (VLMs use processor instead of tokenizer) - - DDP config optimized for VLM training (no grad/param overlap) - - Supports freeze options for language_model, vision_model, vision_projection - - Different training defaults (train_iters=300000, GBS=32, MBS=2) - - Different RNG seed (1234) - - Returns: - ConfigContainer: Base configuration template for VLM full SFT. 
- """ - # Start from the LLM SFT common config - cfg = _sft_common() - - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for VLM - seq_length = 4096 - - # VLM-specific training config - longer training with different batch sizes - cfg.train.train_iters = 300000 - cfg.train.global_batch_size = 32 - cfg.train.micro_batch_size = 2 - cfg.train.manual_gc = True - cfg.train.manual_gc_interval = 100 - cfg.train.manual_gc_eval = 100 - - # VLM-specific validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - - # VLM-specific optimizer settings - higher LR for VLM training - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=3e-4, - min_lr=3e-5, - ) - cfg.optimizer = opt_cfg - cfg.scheduler = scheduler_cfg - - # VLM-specific DDP config - no overlap for VLMs - cfg.ddp = DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ) - - # VLM-specific dataset - uses HuggingFace dataset provider - # hf_processor_path must be set by model-specific config - cfg.dataset = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=None, # Must be set by model-specific config - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=True, - ) - - # VLM uses NullTokenizer - actual tokenization is handled by the processor - cfg.tokenizer = 
TokenizerConfig( - tokenizer_type="NullTokenizer", - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # VLM-specific logger config - cfg.logger = LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ) - - # VLM-specific checkpoint config - cfg.checkpoint.save_interval = 500 - cfg.checkpoint.save = checkpoint_dir - cfg.checkpoint.load = checkpoint_dir - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.fully_parallel_save = True - - # VLM uses different RNG seed - cfg.rng = RNGConfig(seed=1234) - - return cfg - - -def _peft_common_vlm() -> ConfigContainer: - """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models. - - This function inherits from `_peft_common()` and overrides VLM-specific settings. - The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use. - - Key differences from LLM PEFT (`_peft_common`): - - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2) - - Uses NullTokenizer (VLMs use processor instead of tokenizer) - - DDP config optimized for VLM training (no grad/param overlap) - - Supports freeze options for language_model, vision_model, vision_projection - - Different training defaults (train_iters=300000, GBS=32, MBS=2) - - Different RNG seed (1234) - - Higher LR (1e-4) for adapter training - - Returns: - ConfigContainer: Base configuration template for VLM PEFT with LoRA. 
- """ - # Start from the LLM PEFT common config - cfg = _peft_common() - - # Default output directories - base_output_dir = os.path.join(os.getcwd(), "nemo_experiments") - run_output_dir = os.path.join(base_output_dir, "default") - checkpoint_dir = os.path.join(run_output_dir, "checkpoints") - tensorboard_dir = os.path.join(run_output_dir, "tb_logs") - - # Default sequence length for VLM - seq_length = 4096 - - # VLM-specific training config - longer training with different batch sizes - cfg.train.train_iters = 300000 - cfg.train.global_batch_size = 32 - cfg.train.micro_batch_size = 2 - cfg.train.manual_gc = True - cfg.train.manual_gc_interval = 100 - cfg.train.manual_gc_eval = 100 - - # VLM-specific validation config - cfg.validation.eval_interval = 500 - cfg.validation.eval_iters = 32 - - # VLM-specific optimizer settings - higher LR for PEFT - opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing( - lr_warmup_iters=500, - lr_decay_iters=None, # Defaults to train_iters during validation - max_lr=1e-4, # Higher LR for adapter training - min_lr=1e-5, - ) - cfg.optimizer = opt_cfg - cfg.scheduler = scheduler_cfg - - # VLM-specific DDP config - no overlap for VLMs - cfg.ddp = DistributedDataParallelConfig( - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=False, - overlap_param_gather=False, - average_in_collective=True, - data_parallel_sharding_strategy="optim_grads_params", - use_distributed_optimizer=True, - ) - - # VLM-specific dataset - uses HuggingFace dataset provider - # hf_processor_path must be set by model-specific config - cfg.dataset = HFDatasetConversationProvider( - seq_length=seq_length, - hf_processor_path=None, # Must be set by model-specific config - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=True, - ) - - # VLM uses NullTokenizer - actual tokenization is handled by the 
processor - cfg.tokenizer = TokenizerConfig( - tokenizer_type="NullTokenizer", - vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE, - ) - - # VLM-specific logger config - cfg.logger = LoggerConfig( - log_interval=10, - tensorboard_dir=tensorboard_dir, - log_timers_to_tensorboard=True, - ) - - # VLM-specific checkpoint config - cfg.checkpoint.save_interval = 500 - cfg.checkpoint.save = checkpoint_dir - cfg.checkpoint.load = checkpoint_dir - cfg.checkpoint.ckpt_format = "torch_dist" - cfg.checkpoint.fully_parallel_save = True - - # VLM uses different RNG seed - cfg.rng = RNGConfig(seed=1234) - - # Keep LoRA config from _peft_common() - it's already set with standard defaults - - return cfg - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/finetune_utils.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Utility functions for finetuning recipes.""" - -from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig -from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs -from megatron.bridge.data.hf_processors.gsm8k import process_gsm8k_example -from megatron.bridge.data.hf_processors.openmathinstruct2 import process_openmathinstruct2_example -from megatron.bridge.data.hf_processors.squad import process_squad_example -from megatron.bridge.peft.base import PEFT -from megatron.bridge.peft.dora import DoRA -from megatron.bridge.peft.lora import LoRA - - -def default_peft_config(peft_scheme: str | PEFT | None, **kwargs) -> PEFT | None: - """Create default PEFT configuration matching NeMo2 exactly. - - Args: - peft_scheme: PEFT scheme - 'lora', 'dora', PEFT instance, or None for full finetuning - - Returns: - PEFT configuration or None for full finetuning - """ - if peft_scheme is None: - return None # Full finetuning - - if isinstance(peft_scheme, PEFT): - return peft_scheme # User provided custom PEFT - - if isinstance(peft_scheme, str): - if peft_scheme.lower() == "none": - return None - if peft_scheme.lower() == "lora": - return LoRA(**kwargs) - elif peft_scheme.lower() == "dora": - return DoRA(**kwargs) - else: - raise ValueError(f"Unknown PEFT scheme: {peft_scheme}. Supported: 'lora', 'dora', or None") - - raise ValueError(f"Invalid peft type: {type(peft_scheme)}. Expected str, PEFT instance, or None") - - -def default_squad_config(seq_length: int, packed_sequence: bool = True, pad_seq_to_mult: int = 1) -> HFDatasetConfig: - """Create default SQuAD dataset configuration for finetuning recipes. - - Args: - seq_length: Sequence length for the dataset - packed_sequence: Whether to enable packed sequences for training efficiency - pad_seq_to_mult: Optional multiple to pad each sequence to when packing - (set to `2 * context_parallel_size` for THD CP runs). 
- - Returns: - HFDatasetConfig configured for SQuAD finetuning - - Note: - Uses consistent settings across all finetuning recipes: - - SQuAD dataset with appropriate dataloader type - - 10% validation split - - Seed 5678 (different from pretrain seed 1234) - - Packed sequences when enabled improve training efficiency - """ - if packed_sequence: - # Packed sequence configuration - dataset_kwargs = {"pad_to_max_length": True} - packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult) - else: - # Standard configuration - dataset_kwargs = {} - packed_sequence_specs = None - - # Use 'batch' sampler for variable-length finetuning - # Samples full global batch to ensure consistent padding across all microbatches - dataloader_type = "batch" - - return HFDatasetConfig( - dataset_name="squad", - process_example_fn=process_squad_example, - seq_length=seq_length, - seed=5678, # Different from pretrain seed - dataloader_type=dataloader_type, - num_workers=1, - do_validation=True, - do_test=False, - val_proportion=0.1, - dataset_kwargs=dataset_kwargs, - packed_sequence_specs=packed_sequence_specs, - rewrite=False, - ) - - -def default_openmathinstruct2_config( - seq_length: int = 4096, - packed_sequence: bool = False, - pad_seq_to_mult: int = 1, -) -> HFDatasetConfig: - """Create default OpenMathInstruct-2 dataset configuration for finetuning recipes.""" - # Create packed sequence specs if needed - packed_sequence_specs = None - if packed_sequence: - packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult) - - return HFDatasetConfig( - dataset_name="nvidia/OpenMathInstruct-2", # Hugging Face dataset name - split="train_1M", # Default to the 1M subset - process_example_fn=process_openmathinstruct2_example, # Processing function - seq_length=seq_length, - seed=5678, - memmap_workers=1, - # Dataloader config parameters - dataloader_type="batch", - do_validation=True, - 
do_test=False, - val_proportion=0.05, # 950k train, 50k val - num_workers=2, - data_sharding=True, - pin_memory=True, - persistent_workers=False, - packed_sequence_specs=packed_sequence_specs, - rewrite=False, # Rewrite existing processed files - ) - - -def default_gsm8k_config( - seq_length: int = 2048, - packed_sequence: bool = False, - pad_seq_to_mult: int = 1, -) -> HFDatasetConfig: - """Create default GSM8K dataset configuration for finetuning recipes. - - GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse - grade school math word problems. See: https://huggingface.co/datasets/openai/gsm8k - - Args: - seq_length: Sequence length for the dataset (default 2048, sufficient for GSM8K) - packed_sequence: Whether to enable packed sequences for training efficiency - pad_seq_to_mult: Optional multiple to pad each sequence to when packing - (set to `2 * context_parallel_size` for THD CP runs). - - Returns: - HFDatasetConfig configured for GSM8K finetuning - - Note: - - GSM8K has 7,473 train and 1,319 test examples - - Loads the full DatasetDict so the published test split is used for evaluation - - Uses 'batch' dataloader type for variable-length finetuning - """ - # Create packed sequence specs if needed - packed_sequence_specs = None - if packed_sequence: - packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult) - - return HFDatasetConfig( - dataset_name="openai/gsm8k", # Hugging Face dataset name - dataset_subset="main", # 'main' or 'socratic' - process_example_fn=process_gsm8k_example, # Processing function - seq_length=seq_length, - seed=5678, - memmap_workers=1, - # Dataloader config parameters - dataloader_type="batch", - do_validation=False, - do_test=True, - num_workers=2, - data_sharding=True, - pin_memory=True, - persistent_workers=False, - packed_sequence_specs=packed_sequence_specs, - rewrite=False, - ) - -``` - -File: 
/Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py -```py -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Dataset configuration utilities for recipes and training scripts.""" - -import logging -from typing import Callable, List, Optional, Tuple - -from megatron.bridge.data.energon.energon_provider import EnergonProvider -from megatron.bridge.data.loaders import get_blend_and_blend_per_split -from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider -from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider -from megatron.bridge.recipes.utils.finetune_utils import ( - default_gsm8k_config, - default_openmathinstruct2_config, - default_squad_config, -) -from megatron.bridge.training.config import ( - ConfigContainer, - FinetuningDatasetConfig, - GPTDatasetConfig, - MockGPTDatasetConfig, -) - - -logger = logging.getLogger(__name__) - - -_BLEND_TYPE = Optional[Tuple[List[str], Optional[List[float]]]] -_BLEND_PER_SPLIT_TYPE = Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]] -_SPLIT_TYPE = Optional[str] - - -def get_blend_fields_from_data_paths( - data_paths: Optional[List[str]] = None, - data_args_path: Optional[str] = None, - train_data_path: Optional[List[str]] = None, - valid_data_path: Optional[List[str]] = None, - test_data_path: Optional[List[str]] = None, - 
per_split_data_args_path: Optional[str] = None, - mock: bool = False, -) -> Tuple[_BLEND_TYPE, _BLEND_PER_SPLIT_TYPE, _SPLIT_TYPE]: - """ - Common configuration logic for blend, blend_per_split, split dataset config fields. - - Handles mock and real data. If no path to data is provided, mock data will be used. - Prioritizes `data_paths` over split data paths. For all of `data_paths`, `train_data_path`, - `valid_data_path`, and `test_data_path`, two formats are accepted: either (1) a list of prefixes, - e.g. ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped - list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] - - Args: - data_paths (Optional[List[str]]): List of paths to dataset files. - data_args_path (Optional[str]): Path to file containing data arguments. - train_data_path (Optional[List[str]]): List of training data paths. - valid_data_path (Optional[List[str]]): List of validation data paths. - test_data_path (Optional[List[str]]): List of test data paths. - per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration. - mock (bool): Whether to use mock data. If True, ignores data_paths. - - Returns: - A tuple (blend, blend_per_split, split), the corresponding fields to be passed to GPTDatasetConfig. 
- """ - has_any_data_config = any( - [data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path] - ) - - if mock or not has_any_data_config: - # Mock data configuration - blend = None # Will trigger mock mode automatically - blend_per_split = None # Will trigger mock mode automatically - split = "1,1,1" # Equal splits for testing - else: - # Real data configuration - blend, blend_per_split = get_blend_and_blend_per_split( - data_paths=data_paths, - data_args_path=data_args_path, - train_data_paths=train_data_path, - valid_data_paths=valid_data_path, - test_data_paths=test_data_path, - per_split_data_args_path=per_split_data_args_path, - ) - - if blend_per_split is not None: - # When using blend_per_split, split should be None - split = None - elif blend is not None: - # When using regular blend, we can use split - split = "9999,8,2" - else: - # No data provided, fall back to mock mode - split = "1,1,1" - - return blend, blend_per_split, split - - -# --------------------------------------------------------------------------- -# Unified dataset type registry -# --------------------------------------------------------------------------- - -DATASET_TYPES = [ - "llm-pretrain", - "llm-pretrain-mock", - "llm-finetune", - "llm-finetune-preloaded", - "vlm-energon", - "vlm-hf", - "vlm-preloaded", -] - -LLM_FINETUNE_PRESETS: dict[str, Callable] = { - "squad": default_squad_config, - "openmathinstruct2": default_openmathinstruct2_config, - "gsm8k": default_gsm8k_config, -} - - -def extract_and_remove_override(cli_overrides: list[str], key: str, default: str | None = None) -> str | None: - """Extract a Hydra-style override (key=value) from *cli_overrides* and remove it. - - Returns the value if found, otherwise *default*. 
- """ - prefix = f"{key}=" - for i, override in enumerate(cli_overrides): - if override.startswith(prefix): - value = override[len(prefix) :] - cli_overrides.pop(i) - return value - return default - - -def _resolve_seq_length(config: ConfigContainer, seq_length: int | None) -> int: - """Resolve sequence length: explicit arg > model config > 4096 fallback.""" - if seq_length is not None: - return seq_length - if hasattr(config, "model") and config.model is not None and hasattr(config.model, "seq_length"): - return config.model.seq_length - return 4096 - - -def apply_dataset_override( - config: ConfigContainer, - dataset_type: str, - packed_sequence: bool = False, - seq_length: int | None = None, - cli_overrides: list[str] | None = None, -) -> ConfigContainer: - """Replace the recipe's dataset config based on the requested dataset type. - - Args: - config: The recipe config to modify. - dataset_type: One of :data:`DATASET_TYPES`. - packed_sequence: Whether to enable packed sequences. - seq_length: Explicit sequence length (None = use model's or default 4096). - cli_overrides: Mutable list of Hydra-style CLI overrides. For ``llm-finetune``, - ``dataset.dataset_name`` is extracted and consumed here to select the preset. - - Returns: - The modified ConfigContainer. 
- """ - resolved_seq_length = _resolve_seq_length(config, seq_length) - if cli_overrides is None: - cli_overrides = [] - - if dataset_type == "llm-pretrain": - config.dataset = GPTDatasetConfig( - seq_length=resolved_seq_length, - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - num_dataset_builder_threads=1, - blend=None, - blend_per_split=None, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ) - - elif dataset_type == "llm-pretrain-mock": - config.dataset = MockGPTDatasetConfig( - seq_length=resolved_seq_length, - random_seed=1234, - reset_attention_mask=False, - reset_position_ids=False, - eod_mask_loss=False, - num_dataset_builder_threads=1, - split="9999,8,2", - data_sharding=True, - dataloader_type="single", - skip_getting_attention_mask_from_dataset=True, - ) - - elif dataset_type == "llm-finetune": - preset_name = extract_and_remove_override(cli_overrides, "dataset.dataset_name", default="squad") - if preset_name not in LLM_FINETUNE_PRESETS: - raise ValueError( - f"Unknown finetune dataset preset: '{preset_name}'. " - f"Choose from: {', '.join(sorted(LLM_FINETUNE_PRESETS.keys()))}" - ) - factory = LLM_FINETUNE_PRESETS[preset_name] - kwargs: dict = {"packed_sequence": packed_sequence, "pad_seq_to_mult": 1} - kwargs["seq_length"] = resolved_seq_length - config.dataset = factory(**kwargs) - - elif dataset_type == "llm-finetune-preloaded": - config.dataset = FinetuningDatasetConfig( - seq_length=resolved_seq_length, - dataset_root=None, - dataloader_type="batch", - seed=5678, - ) - - elif dataset_type == "vlm-energon": - if isinstance(config.dataset, EnergonProvider): - logger.info("Recipe already provides EnergonProvider; keeping it (preserves task_encoder).") - else: - logger.warning( - "Creating bare EnergonProvider. task_encoder and image_processor are unset; " - "use a recipe that provides them or set via code." 
- ) - config.dataset = EnergonProvider( - path="", - seq_length=resolved_seq_length, - micro_batch_size=config.train.micro_batch_size, - global_batch_size=config.train.global_batch_size, - num_workers=2, - ) - - elif dataset_type == "vlm-hf": - config.dataset = HFDatasetConversationProvider( - seq_length=resolved_seq_length, - hf_processor_path=None, - maker_name="make_cord_v2_dataset", - num_workers=2, - dataloader_type="single", - data_sharding=True, - pin_memory=True, - persistent_workers=False, - pack_sequences_in_batch=False, - ) - - elif dataset_type == "vlm-preloaded": - config.dataset = PreloadedVLMConversationProvider( - seq_length=resolved_seq_length, - hf_processor_path=None, - train_data_path=None, - valid_data_path=None, - test_data_path=None, - dataloader_type="single", - num_workers=2, - ) - - else: - raise ValueError(f"Unknown dataset type: '{dataset_type}'. Choose from: {', '.join(DATASET_TYPES)}") - - if seq_length is not None and hasattr(config, "model") and config.model is not None: - config.model.seq_length = seq_length - - return config - - -def infer_mode_from_dataset(dataset_type: str) -> str: - """Infer training mode from the dataset type prefix.""" - if dataset_type.startswith("llm-pretrain"): - return "pretrain" - return "finetune" - -``` - -File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/datasets/sft.py -```py -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import logging -import math -import os -import re -from pathlib import Path -from typing import Mapping - -import datasets -import numpy as np -import torch -from datasets import load_dataset -from megatron.core.msc_utils import MultiStorageClientFeature -from torch.utils.data import Dataset - -from megatron.bridge.data.datasets.utils import ( - _chat_preprocess, - _get_samples_mapping, - _JSONLMemMapDataset, - _OnlineSampleMapping, - _preprocess, - _tokenize, -) -from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer - - -DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo" -NEMO_CACHE_HOME = Path(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME)) -DEFAULT_NEMO_DATASETS_CACHE = NEMO_CACHE_HOME / "datasets" -NEMO_DATASETS_CACHE = Path(os.getenv("NEMO_DATASETS_CACHE", DEFAULT_NEMO_DATASETS_CACHE)) -DEFAULT_NEMO_MODELS_CACHE = NEMO_CACHE_HOME / "models" -NEMO_MODELS_CACHE = Path(os.getenv("NEMO_MODELS_CACHE", DEFAULT_NEMO_MODELS_CACHE)) - -if os.getenv("TOKENIZERS_PARALLELISM") is None: - os.putenv("TOKENIZERS_PARALLELISM", "True") - -logger = logging.getLogger(__name__) - -# hack to avoid the "not enough disk space" error in some slurm cluster -datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True - -PREFIX_STR = ( - "\x00" # the prefix string used in the tokenizer to deal with the added empty token for some of the tokenizers -) - -__idx_version__ = "0.2" # index file version -__idx_suffix__ = "idx" # index file suffix - - -def get_dataset_root(name: str) -> Path: - """ - Returns the root directory for NeMo datasets, creating it if it doesn't exist. - - Args: - name (str): The name of the dataset, used to create a subdirectory within the NeMo datasets cache. - - Returns: - Path: The path to the dataset's root directory. 
- """ - output = Path(NEMO_DATASETS_CACHE) / name - output.mkdir(parents=True, exist_ok=True) - - return output - - -def create_sft_dataset( - path: str | Path, - tokenizer: "MegatronTokenizer", - seq_length: int = 2048, - add_bos: bool = False, - add_eos: bool = True, - add_sep: bool = False, - seed: int = 1234, - label_key: str = "output", - answer_only_loss: bool = True, - truncation_field: str = "input", - pad_to_max_length: bool = False, - index_mapping_dir: str | None = None, - prompt_template: str = "{input} {output}", - truncation_method: str = "right", - memmap_workers: int = 2, - hf_dataset: bool = False, - global_sample_mapping: bool = False, - get_attention_mask_from_fusion: bool = True, - pack_metadata_file_path: Path | str | None = None, - pad_cu_seqlens: bool = False, - pad_seq_to_mult: int = 1, - chat: bool = False, - use_hf_tokenizer_chat_template: bool = False, - tool_schemas: str | dict | None = None, - **kwargs, -) -> "GPTSFTDataset": - """ - Creates and returns a supervised fine-tuning (SFT) dataset instance. - - This function acts as a factory for different types of SFT datasets based on the - input parameters. It can create standard SFT datasets, chat-specific datasets, - or packed sequence datasets. - - Dataset selection logic: - 1. If path ends with .npy: GPTSFTPackedDataset (legacy packed format) - 2. If path is a packed parquet spec (file/dir/glob ending in .parquet/.pq, - or a directory): GPTSFTPackedParquetDataset - - Note: Selection is based on path pattern, not pack_metadata_file_path - - Schema validation (REQUIRED_COLUMNS) will fast-fail for non-packed files - 3. If chat=True: GPTSFTChatDataset - 4. Otherwise: GPTSFTDataset - - Args: - path (str | Path): Path to the dataset file or packed parquet spec (file/dir/glob). - For packed datasets, this can be a .npy file, a .parquet file, a directory - containing parquet files, or a glob pattern. - tokenizer (MegatronTokenizer): The tokenizer to use for tokenizing the data. 
- seq_length (int, optional): Maximum sequence length for each example. Defaults to 2048. - add_bos (bool, optional): Whether to add a beginning-of-sentence token. Defaults to False. - add_eos (bool, optional): Whether to add an end-of-sentence token. Defaults to True. - add_sep (bool, optional): Whether to add a separation token between prompt and answer. Defaults to False. - seed (int, optional): Random seed for data shuffling. Defaults to 1234. - label_key (str, optional): The key in the dataset corresponding to the label/output. Defaults to "output". - answer_only_loss (bool, optional): If True, compute loss only on the answer part. Defaults to True. - truncation_field (str, optional): Field(s) to truncate if the combined length exceeds `seq_length`. - Comma-separated if multiple. Defaults to "input". - pad_to_max_length (bool, optional): Whether to pad all samples to `max_seq_length`. Defaults to False. - index_mapping_dir (str | None, optional): Directory to store/load index mapping files. Defaults to None. - prompt_template (str, optional): F-string template for combining input fields. - Example: "{input} {output}". Defaults to "{input} {output}". - truncation_method (str, optional): Method for truncation ('left' or 'right'). Defaults to "right". - memmap_workers (int, optional): Number of workers for memory-mapped dataset loading. Defaults to 2. - hf_dataset (bool, optional): Whether to load the dataset using HuggingFace's `datasets` library. - Defaults to False. - global_sample_mapping (bool, optional): Whether to use a global sample mapping for shuffling across all data, - or shuffle within each epoch. Defaults to False. - get_attention_mask_from_fusion (bool): if true, lets attention kernel handle creation of causal mask instead - of adding it to the batch dict. - pack_metadata_file_path (Path | str | None, optional): Path to the metadata file for packed datasets. - When provided, enables packed mode. Required if `pad_cu_seqlens` is True. 
Defaults to None. - pad_cu_seqlens (bool, optional): Whether to pad `cu_seqlens` for packed datasets, - required for cudagraphs. Defaults to False. - chat (bool, optional): If True, creates a `GPTSFTChatDataset`. Defaults to False. - use_hf_tokenizer_chat_template (bool, optional): If True, uses HuggingFace tokenizer's chat template - via `apply_chat_template` method. Only applies when `chat=True`. Defaults to False. - tool_schemas (str | dict | None, optional): Tool schemas for function calling support. - Can be a JSON string or a dict. Only applies when `chat=True` and - `use_hf_tokenizer_chat_template=True`. Defaults to None. - **kwargs: Additional keyword arguments passed to the specific dataset class constructor. - - Returns: - GPTSFTDataset | GPTSFTChatDataset | GPTSFTPackedDataset: An instance of the appropriate SFT dataset class. - """ - # Normalize path to string for consistent handling - path_str = str(path) - - gpt_sft_dataset_kwargs = { - "file_path": path_str, - "tokenizer": tokenizer, - "max_seq_length": seq_length, - "memmap_workers": memmap_workers, - "hf_dataset": hf_dataset, - "global_sample_mapping": global_sample_mapping, - "add_bos": add_bos, - "add_eos": add_eos, - "add_sep": add_sep, - "seed": seed, - "label_key": label_key, - "answer_only_loss": answer_only_loss, - "truncation_field": truncation_field, - "pad_to_max_length": pad_to_max_length, - "index_mapping_dir": index_mapping_dir, - "prompt_template": prompt_template, - "truncation_method": truncation_method, - "get_attention_mask_from_fusion": get_attention_mask_from_fusion, - } - - # Check for .npy packed dataset (legacy format) - if path_str.lower().endswith(".npy"): - return GPTSFTPackedDataset( - pack_metadata_file_path=pack_metadata_file_path, - pad_cu_seqlens=pad_cu_seqlens, - pad_seq_to_mult=pad_seq_to_mult, - **gpt_sft_dataset_kwargs, - **kwargs, - ) - - # Lazy import to avoid circular dependency (packed_parquet imports from sft) - from 
megatron.bridge.data.datasets.packed_parquet import ( - GPTSFTPackedParquetDataset, - is_packed_parquet_spec, - ) - - # Select GPTSFTPackedParquetDataset for any packed parquet spec (file/dir/glob) - # This is determined purely by path pattern, NOT by pack_metadata_file_path. - # Rationale: - # - Directory/glob specs clearly indicate packed parquet shards - # - Schema validation (REQUIRED_COLUMNS) will fast-fail if files aren't packed format - # - This allows externally-prepared packed data to work without requiring MB metadata - if is_packed_parquet_spec(path_str): - return GPTSFTPackedParquetDataset( - pack_metadata_file_path=pack_metadata_file_path, - pad_cu_seqlens=pad_cu_seqlens, - **gpt_sft_dataset_kwargs, - **kwargs, - ) - elif chat: - return GPTSFTChatDataset( - **gpt_sft_dataset_kwargs, - use_hf_tokenizer_chat_template=use_hf_tokenizer_chat_template, - tool_schemas=tool_schemas, - **kwargs, - ) - else: - return GPTSFTDataset( - **gpt_sft_dataset_kwargs, - **kwargs, - ) - - -class GPTSFTDataset(Dataset): - """ """ - - def __init__( - self, - file_path: str, - tokenizer: MegatronTokenizer, - max_seq_length: int = 1024, - min_seq_length: int = 1, - pad_seq_length_to_mult: int = 16, - add_bos: bool = False, - add_eos: bool = True, - add_sep: bool = False, - sep_id: int = None, - max_num_samples: int = None, - seed: int = 1234, - label_key: str = "answer", - answer_only_loss: bool = True, - truncation_field: str = "text", - pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings. 
- index_mapping_dir: str = None, - prompt_template: str = None, - virtual_tokens: int = 0, - tokens_to_generate: int = 0, - memmap_workers: int | None = None, - hf_dataset: bool = False, - global_sample_mapping: bool = False, - truncation_method: str = "right", - special_tokens: Mapping[str, str] | None = None, # special tokens, a dictory of {token_type: token} - is_test: bool = False, - output_original_text: bool = False, - ceil_to_power_2: bool = False, - get_attention_mask_from_fusion: bool = True, - ): - """ - file_path: Path to a JSONL GPT supervised fine-tuning dataset. - Data is formatted as multiple JSON lines with each line formatted as follows: - { - 'input': 'John von Neumann\nVon Neumann made fundamental contributions ... - Q: What did the math of artificial viscosity do?', - 'output': 'smoothed the shock transition without sacrificing basic physics' - } - tokenizer: Tokenizer for the dataset. Instance of a class that inherits MegatronTokenizer (ex: SentencePiece). - max_seq_length (int): maximum sequence length for each dataset examples. - Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. - Data examples will be dropped if they do not meet the min length requirements. - add_bos (bool): Whether to add a beginning of sentence token to each data example - add_eos (bool): Whether to add an end of sentence token to each data example - add_sep (bool): Whether to add a separation token to each data example (goes between prompt and answer) - tokens_to_generate (int): (inference only) Number of tokens to generate during inference - seed: Random seed for data shuffling. - max_num_samples: Maximum number of samples to load. - This can be > dataset length if you want to oversample data. If None, all samples will be loaded. 
- label_key: Key to use for the label in your JSONL file - answer_only_loss: If True, will compute the loss only on the answer part of the input. - If False, will compute the loss on the entire input. - truncation_field: Field to use for truncation. (Options: keys in prompt_template). - Field to be used for truncation if the combined length exceeds the max sequence length. - pad_to_max_length: Whether to pad the input to the max sequence length. - If False, will pad to the max length of the current batch. - index_mapping_dir: Directory to save the index mapping to. - If None, will write to the same folder as the dataset. - prompt_template: Prompt template to inject via an fstring. - Formatted like Q: {context_key}\n\nA: {label_key} - hf_dataset: Whether to load the json file with the HuggingFace dataset. - Otherwise, will load the jsonl file with the JSONLMemMapDataset. - global_sample_mapping: Whether to shuffle all data together, or shuffle the dataset within each epoch - truncation_method: Truncation from which position. Options: ['left', 'right'] - special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}. - Default: { - 'system_turn_start': '', - 'turn_start': '', - 'label_start': '', - 'end_of_turn': '\n', - 'end_of_name': '\n' - } - is_test: Whether this dataset is the test split. - output_original_text (bool): if true, will keep the original text in the output alongside the tokenized ids. - get_attention_mask_from_fusion (bool): if true, lets attention kernel handle creation of causal mask instead - of adding it to the batch dict. 
- """ - self.tokenizer = tokenizer - self.file_path = file_path - self.max_seq_length = max_seq_length - self.min_seq_length = min_seq_length - self.pad_seq_length_to_mult = pad_seq_length_to_mult - self.add_bos = add_bos - self.add_eos = add_eos - self.add_sep = add_sep - self.sep_id = sep_id - self.max_num_samples = max_num_samples - self.seed = seed - self.label_key = label_key - self.answer_only_loss = answer_only_loss - self.truncation_fields = truncation_field.split(",") if truncation_field is not None else [] - self.pad_to_max_length = pad_to_max_length - self.index_mapping_dir = index_mapping_dir - self.prompt_template = prompt_template - self.virtual_tokens = virtual_tokens - self.tokens_to_generate = tokens_to_generate - self.memmap_workers = memmap_workers - self.hf_dataset = hf_dataset - self.global_sample_mapping = global_sample_mapping - self.truncation_method = truncation_method - self.is_test = is_test - self.output_original_text = output_original_text - self.ceil_to_power_2 = ceil_to_power_2 - self.get_attention_mask_from_fusion = get_attention_mask_from_fusion - - if special_tokens is None: - self.special_tokens = { - "system_turn_start": "", - "turn_start": "", - "label_start": "", - "end_of_turn": "\n", - "end_of_name": "\n", - } - else: - self.special_tokens = special_tokens - - self._load_dataset() - - # Validate prompt template - self._maybe_validate_prompt_template() - - # Will be None after this call if `max_num_samples` is None - self._build_samples_mapping() - - def _load_dataset(self): - if self.hf_dataset: - self.indexed_dataset = load_dataset( - "json", - data_files=self.file_path, - cache_dir=self.index_mapping_dir, - num_proc=self.memmap_workers, - split="train", - ) - else: - self.indexed_dataset = _JSONLMemMapDataset( - dataset_paths=[self.file_path], - tokenizer=None, - header_lines=0, - index_mapping_dir=self.index_mapping_dir, - workers=self.memmap_workers, - ) - - def _maybe_validate_prompt_template(self): - assert 
self.prompt_template is not None, ( - f"we need prompt_template to combine contexts and label {self.label_key}" - ) - # When providing things like newlines in the prompt template via the CLI, they are escaped. - # This line unescapes them. - self.prompt_template = self.prompt_template.encode("utf-8").decode("unicode_escape") - self.prompt_template_keys = re.findall(r"{(.*?)}", self.prompt_template) - - label_placeholder = f"{{{self.label_key}}}" - assert self.prompt_template[-len(label_placeholder) :] == label_placeholder, ( - f"{label_placeholder} must be at the end of prompt_template." - ) - - # Legacy checkpoints has self.truncation_fields = ['context'] - # and self.prompt_template_keys = ['input', 'output'] - if len(self.truncation_fields) > 0: - if self.prompt_template_keys[0] == "input" and self.truncation_fields[0] == "context": - self.truncation_fields[0] = self.prompt_template_keys[0] - - assert set(self.truncation_fields).issubset(self.prompt_template_keys), ( - f"truncation_fields {self.truncation_fields} must in {self.prompt_template_keys}" - ) - - def _build_samples_mapping(self): - if self.max_num_samples is not None: - osm = ( - _OnlineSampleMapping(dataset_size=len(self.indexed_dataset), num_samples=self.max_num_samples) - if not self.global_sample_mapping - else None - ) - self.samples_mapping = _get_samples_mapping( - indexed_dataset=self.indexed_dataset, - data_prefix=self.file_path, - num_epochs=None, - max_num_samples=self.max_num_samples, - max_seq_length=self.max_seq_length - 2, - short_seq_prob=0, - seed=self.seed, - name=self.file_path.split("/")[-1], - binary_head=False, - index_mapping_dir=self.index_mapping_dir, - samples_mapping=osm, - ) - else: - self.samples_mapping = None - - def __len__(self): - """Return the total number of samples in this dataset.""" - if self.max_num_samples is None: - return len(self.indexed_dataset) - else: - return len(self.samples_mapping) - - def __getitem__(self, idx): - if isinstance(idx, np.int64): - idx 
= idx.item() - - if self.samples_mapping is not None: - assert idx < len(self.samples_mapping) - idx, _, _ = self.samples_mapping[idx] - if isinstance(idx, (np.uint32, np.int64)): - idx = idx.item() - - assert idx < len(self.indexed_dataset) - # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1 - if idx < 0: - idx = len(self) + idx - auto_gen_idx = True - else: - auto_gen_idx = False - try: - example = self.indexed_dataset[idx] - if auto_gen_idx: - example["__AUTOGENERATED__"] = True - except Exception as e: - logger.error(f"Error while loading example {idx} from dataset {self.file_path}") - raise e - return self._process_example(example) - - def _separate_template(self, prompt_template_values: list[str]): - """ - Combine contexts and label based on prompt_template into a list of strings and a list of keys. - - Args: - prompt_template_values (list[str]): the list of context and label strings - extrated from jsonl file with prompt_template_keys. - - Returns: - template_strings (list[str]): separated prompt_template with contexts/label - placeholder filled with corresponding strings - template_strings_keys (list[str]): strings point to placeholder keys or