diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f5385dc3e..e976f049f 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -4,24 +4,24 @@
     "name": "NVIDIA Nemotron Team"
   },
   "metadata": {
-    "description": "NVIDIA Nemotron AI stack plugins — pipeline builder, model knowledge bases, and contributor tools"
+    "description": "NVIDIA Nemotron AI stack plugins"
   },
   "plugins": [
     {
-      "name": "nemotron",
-      "source": "./plugins/nemotron",
-      "description": "NVIDIA Nemotron AI stack — pipeline builder and model knowledge bases",
-      "version": "0.3.0",
+      "name": "nemotron-customize",
+      "source": "./skills/nemotron-customize",
+      "description": "Compose runnable Nemotron model-customization pipelines from repo steps.",
+      "version": "0.1.0",
       "category": "ml-pipelines",
-      "keywords": ["nvidia", "nemotron", "training", "sft", "rl", "megatron", "models"]
-    },
-    {
-      "name": "nemotron-dev",
-      "source": "./plugins/nemotron-dev",
-      "description": "Internal: contributor tools for Nemotron repo developers",
-      "version": "0.3.0",
-      "category": "developer-tools",
-      "keywords": ["nvidia", "nemotron", "internal", "contributing", "dev"]
+      "keywords": [
+        "nvidia",
+        "nemotron",
+        "training",
+        "sft",
+        "rl",
+        "megatron",
+        "customization"
+      ]
     }
   ]
 }
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
deleted file mode 100644
index 46b16e537..000000000
--- a/.claude-plugin/plugin.json
+++ /dev/null
@@ -1,11 +0,0 @@
-{
-  "name": "nemotron-customize",
-  "description": "Compose custom ML training pipelines from the NVIDIA AI stack",
-  "version": "0.1.0",
-  "author": {
-    "name": "NVIDIA Nemotron Team"
-  },
-  "skills": [
-    "./skills/"
-  ]
-}
diff --git a/.gitignore b/.gitignore
index 130bd92b1..251fea2f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,6 +105,7 @@ CLAUDE.md
 # Compiled config
 config.yaml
 main.py
+src/nemotron/steps/_bootstrap/runtime/
 
 # Documentation build
 docs/_build/
diff --git a/README.md b/README.md
index 43da7e9fe..027c22817 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,36 @@
 
 ---
 
+## Use from Claude Code
+
+This repo ships a Claude Code plugin called **`nemotron-customize`** that turns the step catalog under [`src/nemotron/steps/`](./src/nemotron/steps/) into a guided, repo-native pipeline builder.
+
+Install once:
+
+```text
+/plugin marketplace add NVIDIA/Nemotron
+/plugin install nemotron-customize@nvidia-nemotron
+```
+
+Then, **start Claude Code from the repo root** and invoke the skill:
+
+```bash
+cd /path/to/Nemotron        # repo root: must contain pyproject.toml and src/nemotron/steps/
+claude
+```
+
+```text
+/nemotron-customize
+```
+
+The skill resolves all file paths against your current working directory, so it must be invoked from the Nemotron checkout root. Running it from a subdirectory will cause file reads to fail.
+
+The skill plans the step DAG, validates artifact wiring, and emits the YAML configs needed to run the requested pipeline. See [`skills/nemotron-customize/SKILL.md`](./skills/nemotron-customize/SKILL.md) for the full contract.
+
+> The marketplace installs **only** `nemotron-customize`. The other folders under [`skills/`](./skills/) (model knowledge bases, contributor add-`*` skills) stay on disk for repo browsing but are not loaded as plugins.
+
+---
+
 ## Repository Overview
 
 ```
diff --git a/deploy/nemotron-customizer/airgap/.gitignore b/deploy/nemotron-customizer/airgap/.gitignore
new file mode 100644
index 000000000..6ccaadce0
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/.gitignore
@@ -0,0 +1,7 @@
+# Generated by airgap runner.
+out/
+airgap-bundle/
+archives/
+__pycache__/
+*.lock.yaml
+*.tar
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution b/deploy/nemotron-customizer/airgap/Dockerfile.execution
new file mode 100644
index 000000000..acc9fb7bd
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution
@@ -0,0 +1,52 @@
+# Derivative execution image for Nemotron Customizer airgap.
+# Built from the real training/runtime image; it adds only small missing
+# wrapper packages and bakes in the repo overlays listed in repo-overlays.json.
+
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+ARG EXECUTION_REQUIREMENTS
+ARG REPO_OVERLAYS
+ARG REPO_OVERLAYS_DIR
+ARG PYTHON_BIN=python
+ARG PIP_NO_DEPS=true
+
+ENV HF_HUB_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=1
+ENV HF_DATASETS_OFFLINE=1
+ENV WANDB_MODE=offline
+
+COPY ${EXECUTION_REQUIREMENTS} /opt/nemotron-airgap/execution-requirements.txt
+COPY ${REPO_OVERLAYS} /opt/nemotron-airgap/repo-overlays.json
+COPY ${REPO_OVERLAYS_DIR}/ /opt/nemotron-airgap/repo-overlays/
+
+# Build-time installs keep --no-cache-dir so derivative image layers stay small.
+RUN if [ -s /opt/nemotron-airgap/execution-requirements.txt ]; then \
+      if [ "${PIP_NO_DEPS}" = "true" ]; then \
+        ${PYTHON_BIN} -m pip install --no-cache-dir --no-deps -r /opt/nemotron-airgap/execution-requirements.txt; \
+      else \
+        ${PYTHON_BIN} -m pip install --no-cache-dir -r /opt/nemotron-airgap/execution-requirements.txt; \
+      fi; \
+    fi && \
+    ${PYTHON_BIN} - <<'PY'
+import json
+import pathlib
+import shutil
+
+root = pathlib.Path("/opt/nemotron-airgap/repo-overlays")
+items = json.loads(pathlib.Path("/opt/nemotron-airgap/repo-overlays.json").read_text())
+for item in items:
+    repo = item["repo"]
+    source = item.get("source", repo)
+    target = pathlib.Path(item["target"])
+    src = root / source
+    if not src.exists():
+        raise SystemExit(f"missing baked repo overlay: {src}")
+    if target.exists() or target.is_symlink():
+        if target.is_dir() and not target.is_symlink():
+            shutil.rmtree(target)
+        else:
+            target.unlink()
+    target.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copytree(src, target)
+PY
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore
new file mode 100644
index 000000000..9ec7d6457
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/Dockerfile.execution.dockerignore
@@ -0,0 +1,14 @@
+**
+
+!deploy
+!deploy/nemotron-customizer
+!deploy/nemotron-customizer/airgap
+!deploy/nemotron-customizer/airgap/out
+!deploy/nemotron-customizer/airgap/out/execution-context
+!deploy/nemotron-customizer/airgap/out/execution-context/**
+!deploy/nemotron-customizer/airgap/out/repo-overlays
+!deploy/nemotron-customizer/airgap/out/repo-overlays/**
+
+**/.git
+**/__pycache__
+**/*.pyc
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher b/deploy/nemotron-customizer/airgap/Dockerfile.launcher
new file mode 100644
index 000000000..7d26315d5
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/Dockerfile.launcher
@@ -0,0 +1,30 @@
+# Launcher image for Nemotron Customizer airgap.
+# It contains the repo and a uv-synced environment. It does not run training.
+
+ARG BASE_IMAGE=python:3.12-slim
+FROM ${BASE_IMAGE}
+
+ARG UV_VERSION=0.11.1
+
+WORKDIR /workspace/Nemotron
+
+ENV UV_LINK_MODE=copy
+ENV UV_PYTHON_DOWNLOADS=never
+ENV HF_HUB_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=1
+ENV HF_DATASETS_OFFLINE=1
+ENV WANDB_MODE=offline
+ENV PYTHONPATH=/workspace/Nemotron/src
+ENV PATH=/workspace/Nemotron/.venv/bin:$PATH
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN python -m pip install --no-cache-dir "uv==${UV_VERSION}"
+
+COPY . .
+
+RUN uv sync --frozen --no-dev
+
+CMD ["bash"]
diff --git a/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore
new file mode 100644
index 000000000..6cecc5520
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/Dockerfile.launcher.dockerignore
@@ -0,0 +1,21 @@
+.git
+.venv
+.ruff_cache
+.pytest_cache
+**/__pycache__
+**/*.pyc
+
+/.nemo_run
+/outputs
+/output
+/logs
+/checkpoints
+/wandb
+/data
+/downloads
+
+deploy/nemotron-customizer/airgap/out
+deploy/nemotron-customizer/airgap/airgap-bundle
+deploy/nemotron-customizer/airgap/archives
+deploy/nemotron-customizer/airgap/*.tar
+deploy/nemotron-customizer/airgap/*.lock.yaml
diff --git a/deploy/nemotron-customizer/airgap/README.md b/deploy/nemotron-customizer/airgap/README.md
new file mode 100644
index 000000000..718135790
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/README.md
@@ -0,0 +1,135 @@
+# Nemotron Customizer Airgap
+
+This folder is scoped only to Nemotron Customizer steps under
+`src/nemotron/steps/`.
+
+The flow is intentionally small:
+
+1. Build one **launcher image** with this repo and `uv.lock`.
+2. Build one or more **execution images** by grouping selected workflow stages by base image.
+3. Save those images as tarballs for the airgapped side.
+4. Keep models, datasets, checkpoints, and customer files on persistent storage.
+
+Edit `airgap.yaml` first:
+
+- `workflow.stages`: the Nemotron Customizer steps the customer wants to run
+- `dependencies`: central step dependency map, for example SFT training needs SFT packing
+- `step_execution_images`: which execution image each step should use
+- `execution_images`: the base image, output tag, tarball name, and Python requirements (known up front or found by import probes)
+
+Only steps reached from `workflow.stages` are built. Steps are grouped by
+`base_image + repo_overlays`; each group gets one derivative image with the
+union of its small missing packages. If two selected step families share the
+same base image and repo overlays, the runner emits one combined execution image for
+both.
+
+Run from the repo root:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml
+```
+
+That prints the plan. To actually pull/build/save images on the connected
+machine:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --execute
+```
+
+To run only a few stages:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --stage validate \
+  --stage discover-execution-deps
+```
+
+To override the workflow without editing YAML, pass one or more selected
+Nemotron step targets. Dependencies are still expanded from `dependencies`.
+For example, SDG plus SFT also adds `data_prep/sft_packing` because SFT needs packed
+data:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --target sdg/data_designer:tiny \
+  --target sft/megatron_bridge:tiny
+```
+
+Outputs are written under `deploy/nemotron-customizer/airgap/out/` by default:
+
+- `airgap-manifest.yaml`: what was validated and built
+- `airgap-build-state.yaml`: incomplete execute run state used for resume
+- `airgap-build-complete.yaml`: final execute run state after success
+- `requirements-<execution-group>.txt`: small missing packages per execution image
+- `repo-overlays-<execution-group>.json`: git auto-mounts discovered from selected step configs
+- `launcher-image.tar`
+- `execution-*.tar`
+- SHA256 checksums for saved image tarballs in `airgap-manifest.yaml`
+
+If an execute run fails midway, leave `airgap-build-state.yaml` in place and rerun
+the same command. Completed expensive actions are reused when their artifacts
+still exist. If you intentionally change the workflow or image plan before
+finishing, move or remove `airgap-build-state.yaml` first; the runner will not
+silently overwrite incomplete state from a different plan.
+
+Runtime dependency probes use Docker volumes named
+`nemotron-airgap-pip-cache-<platform>` to avoid downloading the same wheels on
+every probe loop. To reset them, list the volumes with
+`docker volume ls | grep nemotron-airgap-pip-cache` and remove the relevant
+one with `docker volume rm`.
+
+Large assets are not baked into images. The customer should stage them on
+executor-visible persistent storage and reference them through config overrides
+and `run.env.mounts`.
+
+During dependency discovery, the runner mounts the connected-machine checkout
+into each execution image only to probe imports. The final execution image deliberately
+does not bake this repo; the launcher image and the normal nemo-run/nemo-runspec
+code transport provide the repo to the remote job at submission time.
+
+Repo logistics stay outside `airgap.yaml`. If a selected step config contains
+`${auto_mount:git+...}`, the runner treats it as a connected-machine build input:
+it fetches that pinned repo and bakes it into the derivative execution image at the
+requested target path. Runtime jobs then use the baked image and do not clone
+from GitHub. Site-specific data/model mounts remain in env profiles or step
+overrides.
+
+If the connected machine is not the same architecture as the target cluster,
+set `platform: linux/amd64` on the `launcher_image` or execution image entry in
+`airgap.yaml`. If you need to minimize transfer size for several images that
+share layers, `docker save -o all-images.tar tag1 tag2 ...` can be used after
+the runner builds the images; a single tar deduplicates shared layers better
+than one tar per image.
+
+The Dockerfiles expect the chosen base images to have Python and `pip` available
+for bootstrapping small offline additions. The runtime defaults bake
+`HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`, and
+`WANDB_MODE=offline`; customers with an internal mirror can override those at
+submission time through their env profile or `run.env.env_vars`.
+
+For SFT Megatron-Bridge, build with the normal config so the runner can discover
+the pinned Megatron-LM and Megatron-Bridge auto-mounts:
+
+```yaml
+workflow:
+  stages:
+    - sft/megatron_bridge:tiny
+```
+
+When submitting inside the airgap, use the deploy overlay config so those git
+auto-mounts are cleared at runtime while persistent storage mounts from the env
+profile still apply. Use the image printed by the runner under
+`selected execution images`, or read it from `out/airgap-manifest.yaml` under
+`step_execution_images`.
+
+```bash
+uv run nemotron steps run sft/megatron_bridge \
+  -c deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml \
+  -b <your-airgap-profile> \
+  run.env.container_image=<image-printed-for-sft/megatron_bridge>
+```
diff --git a/deploy/nemotron-customizer/airgap/SKILL.md b/deploy/nemotron-customizer/airgap/SKILL.md
new file mode 100644
index 000000000..20a0d0798
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/SKILL.md
@@ -0,0 +1,115 @@
+---
+name: nemotron-customizer-airgap
+description: Prepare, validate, build, and use Nemotron Customizer airgap image bundles for offline clusters. Use when planning airgapped deployments, editing deploy/nemotron-customizer/airgap/airgap.yaml, selecting workflow targets, grouping step execution images, baking repo overlays or wheel additions, resuming airgap runner builds, or submitting `nemotron steps run` jobs inside an airgapped environment.
+---
+
+# Nemotron Customizer Airgap
+
+Use this skill to help an agent produce a connected-machine airgap bundle and
+then submit Nemotron Customizer steps from the airgapped side. Keep it grounded
+in the checked-in runner and manifests; do not invent a parallel packaging flow.
+
+## Read First
+
+- `deploy/nemotron-customizer/airgap/README.md` for the operator flow.
+- `deploy/nemotron-customizer/airgap/airgap.yaml` for the current image map.
+- `deploy/nemotron-customizer/airgap/runner.py` when changing behavior.
+- `tests/deploy/test_airgap_runner.py` before editing runner logic.
+- `deploy/nemotron-customizer/airgap/configs/` for runtime overlay configs.
+
+For selected steps, inspect the catalog through the CLI:
+
+```bash
+uv run nemotron steps show <step_id> --json
+```
+
+## Workflow
+
+1. Establish the side of the workflow:
+   - Connected machine: validate, build, save image tarballs.
+   - Airgapped side: load images, set env profiles, run selected steps.
+
+2. Gather the minimum inputs:
+   - Target steps and config names, for example `sft/megatron_bridge:tiny`.
+   - Target architecture or Docker platform, for example `linux/amd64`.
+   - Available base images and whether the connected machine can pull them.
+   - Airgapped env profile name, mounts, model/data/checkpoint locations.
+   - Whether destructive or expensive actions such as `--execute`, Docker build,
+     Docker volume cleanup, or state-file removal are explicitly allowed.
+
+3. Plan with the runner first:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml
+```
+
+Use `--target <step_id>:<config>` for one-off selections without editing YAML.
+The runner expands dependencies from `dependencies`, validates selected step
+files/configs, groups execution images, and prints selected execution images.
+
+4. Edit `airgap.yaml` only where the runner expects configuration:
+   - `workflow.stages` or CLI `--target` for selected customer steps.
+   - `dependencies` for explicit upstream Nemotron Customizer step outputs.
+   - `step_execution_images` for step-to-image mapping.
+   - `execution_images` for base image, tag, tar, platform, and import probes.
+   - `launcher_image` for the launcher container.
+
+5. Execute only when the user asks for a real build:
+
+```bash
+uv run python deploy/nemotron-customizer/airgap/runner.py \
+  --config deploy/nemotron-customizer/airgap/airgap.yaml \
+  --execute
+```
+
+If a build fails midway, keep `airgap-build-state.yaml` and rerun the same
+command. Remove or move that state only when intentionally changing the plan.
+
+6. On the airgapped side, use images from `out/airgap-manifest.yaml` under
+`step_execution_images`. Submit with the plural `nemotron steps` CLI:
+
+```bash
+uv run nemotron steps run <step_id> \
+  -c <config-or-airgap-overlay> \
+  -b <airgap-profile> \
+  run.env.container_image=<image-from-manifest>
+```
+
+For `sft/megatron_bridge`, prefer the airgap overlay configs under
+`deploy/nemotron-customizer/airgap/configs/`; they clear runtime git auto-mounts
+because the runner bakes those repos into the execution image.
+
+## Guardrails
+
+- Keep models, datasets, checkpoints, secrets, and customer files out of images.
+  Put them on persistent storage and reference them through config overrides and
+  `run.env.mounts`.
+- Treat `${auto_mount:git+...}` as a connected-machine build input. The runner
+  bakes pinned repo overlays into execution images so airgapped jobs do not clone
+  from GitHub.
+- Do not add missing packages blindly. Let `discover-execution-deps` and
+  import probes determine small additions; keep heavyweight framework deps in
+  the base image choice.
+- Preserve offline defaults unless the user has an internal mirror:
+  `HF_HUB_OFFLINE=1`, `TRANSFORMERS_OFFLINE=1`, `HF_DATASETS_OFFLINE=1`,
+  and `WANDB_MODE=offline`.
+- Use `nemotron steps ...`; do not reintroduce `nemotron step ...`.
+
+## Validation
+
+After edits to runner logic, YAML structure, or airgap docs, run:
+
+```bash
+uv run pytest tests/deploy/test_airgap_runner.py -q
+```
+
+For CLI-facing examples, also smoke the command shape:
+
+```bash
+uv run nemotron steps --help
+uv run nemotron steps show data_prep/sft_packing --json
+```
+
+Do not run Docker build/save stages during validation unless the user explicitly
+asked for a real connected-machine bundle build.
diff --git a/deploy/nemotron-customizer/airgap/airgap.yaml b/deploy/nemotron-customizer/airgap/airgap.yaml
new file mode 100644
index 000000000..61ae65c46
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/airgap.yaml
@@ -0,0 +1,129 @@
+# One file controls the Nemotron Customizer airgap plan.
+#
+# Change workflow.stages to the steps the customer wants. The runner expands
+# dependencies, validates those step files/configs, groups selected steps by
+# execution image, then builds only the images needed for that selection.
+
+workflow:
+  name: sft-megatron-bridge
+  stages:
+    - sft/megatron_bridge:tiny
+  # Example SDG-only run:
+  # stages:
+  #   - sdg/data_designer:tiny
+  # Example SDG -> SFT run:
+  # stages:
+  #   - sdg/data_designer:tiny
+  #   - sft/megatron_bridge:tiny
+
+build_stages:
+  - validate
+  - discover-execution-deps
+  - build-launcher-image
+  - build-execution-images
+  - save-images
+
+paths:
+  output_dir: deploy/nemotron-customizer/airgap/out
+
+launcher_image:
+  base_image: python:3.12-slim
+  tag: nemotron-customizer-launcher-airgap:latest
+  tar: launcher-image.tar
+
+# Central dependency map. Keep this small and explicit: it is only for steps
+# that naturally require a previous Nemotron Customizer step output.
+dependencies:
+  sft/megatron_bridge:
+    - data_prep/sft_packing:tiny
+  peft/megatron_bridge:
+    - data_prep/sft_packing:tiny
+  pretrain/megatron_bridge:
+    - data_prep/pretrain_prep:tiny
+  pretrain/automodel:
+    - data_prep/pretrain_prep:tiny
+  rl/nemo_rl/dpo:
+    - data_prep/rl_prep:tiny
+  rl/nemo_rl/rlhf:
+    - data_prep/rl_prep:tiny
+  rl/nemo_rl/rlvr:
+    - data_prep/rl_prep:tiny
+  # SDG can feed SFT or RL prep, but it is not forced as a dependency because
+  # many customers bring their own JSONL on persistent storage.
+
+# Step -> execution-image mapping. The runner only uses entries reached from
+# workflow.stages after dependency expansion.
+step_execution_images:
+  byob/mcq: nemo-data-designer
+  convert/hf_to_megatron: nemo-megatron
+  convert/megatron_to_hf: nemo-megatron
+  convert/merge_lora: nemo-megatron
+  curate/nemo_curator: nemo-curator
+  env/env_toml: launcher-python
+  eval/model_eval: nemo-eval
+  optimize/modelopt/distill: nemo-modelopt
+  optimize/modelopt/prune: nemo-modelopt
+  optimize/modelopt/quantize: nemo-modelopt
+  peft/automodel: nemo-automodel
+  peft/megatron_bridge: nemo-megatron
+  data_prep/pretrain_prep: nemo-megatron
+  data_prep/rl_prep: nemo-rl
+  data_prep/sft_packing: nemo-megatron
+  pretrain/automodel: nemo-automodel
+  pretrain/megatron_bridge: nemo-megatron
+  rl/nemo_rl/dpo: nemo-rl
+  rl/nemo_rl/rlhf: nemo-rl
+  rl/nemo_rl/rlvr: nemo-rl
+  sdg/data_designer: nemo-data-designer
+  sft/automodel: nemo-automodel
+  sft/megatron_bridge: nemo-megatron
+  translate/nemo_curator: nemo-curator
+
+execution_images:
+  launcher-python:
+    base_image: python:3.12-slim
+    tag: nemotron-customizer-python-execution-airgap:latest
+    tar: execution-python-image.tar
+
+  nemo-megatron:
+    base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    tag: nemotron-customizer-nemo-megatron-airgap:latest
+    tar: execution-nemo-megatron-image.tar
+    required_imports: []
+
+  nemo-automodel:
+    base_image: nvcr.io/nvidia/nemo-automodel:26.04
+    tag: nemotron-customizer-nemo-automodel-airgap:latest
+    tar: execution-nemo-automodel-image.tar
+    required_imports: []
+
+  nemo-rl:
+    base_image: nvcr.io/nvidia/nemo-rl:v0.6.0
+    tag: nemotron-customizer-nemo-rl-airgap:latest
+    tar: execution-nemo-rl-image.tar
+    required_imports: []
+
+  nemo-modelopt:
+    base_image: nvcr.io/nvidia/nemo:26.02
+    tag: nemotron-customizer-nemo-modelopt-airgap:latest
+    tar: execution-nemo-modelopt-image.tar
+    required_imports: []
+
+  nemo-curator:
+    base_image: nvcr.io/nvidia/nemo-curator:25.07
+    tag: nemotron-customizer-nemo-curator-airgap:latest
+    tar: execution-nemo-curator-image.tar
+    required_imports: []
+
+  nemo-data-designer:
+    base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    tag: nemotron-customizer-nemo-data-designer-airgap:latest
+    tar: execution-nemo-data-designer-image.tar
+    required_imports:
+      - data_designer
+
+  nemo-eval:
+    base_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    tag: nemotron-customizer-nemo-eval-airgap:latest
+    tar: execution-nemo-eval-image.tar
+    required_imports: []
diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml
new file mode 100644
index 000000000..a2e4b828c
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_default.yaml
@@ -0,0 +1,12 @@
+# Airgap runtime overlay for sft/megatron_bridge:default.
+#
+# The connected-machine airgap runner bakes the auto_mount repos from the base
+# config into the derivative execution image. At runtime, clear those git auto-mounts
+# so the airgapped job does not clone from GitHub. Env-profile persistent
+# storage mounts still append normally.
+
+defaults: ../../../../src/nemotron/steps/sft/megatron_bridge/config/default.yaml
+
+run:
+  env:
+    mounts: []
diff --git a/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml
new file mode 100644
index 000000000..eb71f5f96
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/configs/sft_megatron_bridge_tiny.yaml
@@ -0,0 +1,12 @@
+# Airgap runtime overlay for sft/megatron_bridge:tiny.
+#
+# The connected-machine airgap runner bakes the auto_mount repos from the base
+# config into the derivative execution image. At runtime, clear those git auto-mounts
+# so the airgapped job does not clone from GitHub. Env-profile persistent
+# storage mounts still append normally.
+
+defaults: ../../../../src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml
+
+run:
+  env:
+    mounts: []
diff --git a/deploy/nemotron-customizer/airgap/runner.py b/deploy/nemotron-customizer/airgap/runner.py
new file mode 100644
index 000000000..c6cf33d4e
--- /dev/null
+++ b/deploy/nemotron-customizer/airgap/runner.py
@@ -0,0 +1,1244 @@
+#!/usr/bin/env python3
+"""Lightweight airgap image runner for Nemotron Customizer.
+
+This file intentionally lives under deploy/nemotron-customizer/airgap instead
+of adding a new step. It is a connected-machine helper that validates requested
+steps, discovers small Python packages missing from execution images, builds
+launcher/execution images, and saves image tarballs.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import hashlib
+import importlib.metadata as metadata
+import json
+import re
+import shutil
+import subprocess
+import sys
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+import tomllib
+import yaml
+
+AIRGAP_DIR = Path(__file__).resolve().parent  # directory that contains this runner
+REPO_ROOT = AIRGAP_DIR.parents[2]  # deploy/nemotron-customizer/airgap -> three levels up
+SRC_ROOT = REPO_ROOT / "src"
+STEP_ROOT = SRC_ROOT / "nemotron" / "steps"  # root of the step catalog
+DEFAULT_OUTPUT_DIR = AIRGAP_DIR / "out"  # used by main() when paths.output_dir is not set
+UV_VERSION = "0.11.1"  # same value as the ARG UV_VERSION default in Dockerfile.launcher
+PROGRESS_STATE = "airgap-build-state.yaml"  # incomplete execute-run state, used for resume
+COMPLETE_STATE = "airgap-build-complete.yaml"  # final execute-run state after success
+LOCAL_PREFIXES = ("nemotron", "nemo_runspec")  # NOTE(review): presumably repo-local import prefixes — confirm
+CORE_IMPORTS = {  # NOTE(review): presumably imports expected to come from the base image — confirm
+    "datasets",
+    "megatron",
+    "nemo",
+    "numpy",
+    "ray",
+    "torch",
+    "transformers",
+    "triton",
+    "vllm",
+}
+IMPORT_ALIASES = {  # import name -> distribution name, for packages where the two differ
+    "yaml": "pyyaml",
+    "pydantic_settings": "pydantic-settings",
+    "huggingface_hub": "huggingface-hub",
+    "cosmos_xenna": "cosmos-xenna",
+    "data_designer": "data-designer",
+    "nemo_curator": "nemo-curator",
+}
+
+
+@dataclass(frozen=True)
+class Target:  # One requested step, written on the CLI as step-id[:config].
+    step_id: str  # step identifier, e.g. "sft/megatron_bridge"
+    config: str | None = None  # optional config name, e.g. "tiny"
+
+    @property
+    def spec(self) -> str:  # "step_id:config", or the bare step_id when config is falsy
+        return f"{self.step_id}:{self.config}" if self.config else self.step_id
+
+
+@dataclass
+class StepInfo:  # Per-target record; main() receives these from validate_targets().
+    target: Target  # the requested step/config this record describes
+    step_dir: Path
+    step_py: Path  # NOTE(review): presumably the step's Python entry file — confirm
+    step_toml: Path  # NOTE(review): presumably the step's TOML metadata file — confirm
+    config_path: Path | None  # may be None; presumably when no config was named — confirm
+    module: str  # NOTE(review): presumably the step's import path — confirm
+    mounts: list[Any] = field(default_factory=list)
+    repo_overlays: list[RepoOverlay] = field(default_factory=list)  # forward ref is fine: annotations are lazy
+
+
+@dataclass(frozen=True)
+class RepoOverlay:  # A git repo to bake into an execution image; frozen, so instances are hashable.
+    repo: str
+    url: str
+    ref: str  # NOTE(review): presumably the pinned git ref — confirm
+    target: str  # NOTE(review): presumably the destination path inside the image — confirm
+
+
+@dataclass
+class ExecutionGroup:  # One derivative execution image plus the steps that run in it.
+    name: str  # used by main() in the "build-execution-image:<name>" action id
+    base_image: str  # mirrors execution_images.<name>.base_image in airgap.yaml
+    tag: str  # derivative image tag; main() checks it with docker_image_exists()
+    tar: Path  # NOTE(review): presumably where the saved image tarball is written — confirm
+    steps: list[str]
+    platform: str | None = None  # optional Docker platform, e.g. "linux/amd64"
+    required_imports: set[str] = field(default_factory=set)
+    repo_overlays: list[RepoOverlay] = field(default_factory=list)
+    pip_no_deps: bool = True  # NOTE(review): presumably feeds PIP_NO_DEPS in Dockerfile.execution — confirm
+    candidate_imports: set[str] = field(default_factory=set)
+    missing_imports: list[str] = field(default_factory=list)
+    missing_core_imports: list[str] = field(default_factory=list)
+    requirements: list[str] = field(default_factory=list)
+    requirements_path: Path | None = None  # None until set elsewhere in this file
+    repo_overlays_path: Path | None = None  # None until set elsewhere in this file
+    selected_image: str | None = None
+    image_names: set[str] = field(default_factory=set)
+
+
+@dataclass
+class RunState:  # Resume bookkeeping that main() passes to begin_action/complete_action/action_completed.
+    path: Path  # NOTE(review): presumably the PROGRESS_STATE file under output_dir — confirm
+    done_path: Path  # NOTE(review): presumably the COMPLETE_STATE file under output_dir — confirm
+    data: dict[str, Any]
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="Build Nemotron Customizer airgap images from one YAML file.")
+    parser.add_argument("--config", default=str(AIRGAP_DIR / "airgap.yaml"), help="Airgap runner YAML.")
+    parser.add_argument("--execute", action="store_true", help="Run docker/git commands. Default prints the plan.")
+    parser.add_argument("--stage", action="append", help="Stage to run. Repeatable. Defaults to config stages.")
+    parser.add_argument(
+        "--target",
+        action="append",
+        help="Nemotron step target step-id[:config]. Repeatable. Overrides workflow.stages.",
+    )
+    args = parser.parse_args(argv)
+
+    config_path = resolve_input_path(Path(args.config))
+    cfg = load_yaml(config_path)
+    if args.target:
+        cfg = with_workflow_targets(cfg, normalize_target_specs(args.target))
+    stages = normalize_stages(args.stage or cfg.get("build_stages") or cfg.get("stages") or [])
+    output_dir = resolve_repo_path(Path(cfg.get("paths", {}).get("output_dir", DEFAULT_OUTPUT_DIR)))
+    if "build-execution-images" in stages:
+        validate_docker_context_path(output_dir, field="paths.output_dir")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    run_state = load_or_start_run_state(
+        output_dir,
+        config_path=config_path,
+        cfg=cfg,
+        stages=stages,
+        execute=args.execute,
+    )
+    saved_images: list[dict[str, Any]] = []
+    workflow = cfg.get("workflow") if isinstance(cfg.get("workflow"), Mapping) else {}
+
+    print(f"[airgap] config={config_path}")
+    print(f"[airgap] mode={'execute' if args.execute else 'plan'}")
+    print(f"[airgap] stages={', '.join(stages)}")
+
+    expanded_targets: list[Target] = []
+    step_infos: dict[str, StepInfo] = {}
+    groups: list[ExecutionGroup] = []
+    workflow_manifest: dict[str, Any] = {
+        "stages": list(workflow.get("stages") or []),
+    }
+    if workflow.get("name"):
+        workflow_manifest["name"] = workflow.get("name")
+    manifest: dict[str, Any] = {
+        "schema_version": 1,
+        "workflow": workflow_manifest,
+        "output_dir": str(output_dir),
+        "build_stages": stages,
+    }
+
+    if "validate" in stages or any(stage_needs_targets(stage) for stage in stages):
+        begin_action(run_state, "validate")
+        expanded_targets = expand_targets(cfg)
+        step_infos = validate_targets(expanded_targets)
+        manifest["targets"] = [step_to_manifest(info) for info in step_infos.values()]
+        print(f"[validate] {len(step_infos)} target(s) ok")
+        complete_action(run_state, "validate", {"targets": [target.spec for target in expanded_targets]})
+
+    if any(stage in stages for stage in ("discover-execution-deps", "build-execution-images", "save-images")):
+        groups = execution_groups(cfg, output_dir=output_dir, step_infos=step_infos)
+        manifest["execution_groups"] = [execution_group_manifest(group) for group in groups]
+
+    if "discover-execution-deps" in stages:
+        if action_completed(run_state, "discover-execution-deps") and hydrate_discovered_groups(run_state, groups):
+            print("[resume] skipping discover-execution-deps; using saved probe results")
+        else:
+            begin_action(run_state, "discover-execution-deps")
+            locked_versions = locked_package_versions(REPO_ROOT / "uv.lock")
+            for group in groups:
+                discover_execution_deps(
+                    group,
+                    step_infos=step_infos,
+                    locked_versions=locked_versions,
+                    execute=args.execute,
+                )
+            remember_discovered_groups(run_state, groups)
+            complete_action(run_state, "discover-execution-deps", {"groups": [group.name for group in groups]})
+        manifest["execution_groups"] = [execution_group_manifest(group) for group in groups]
+
+    if "build-launcher-image" in stages:
+        launcher_image = cfg.get("launcher_image", {})
+        launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest")
+        platform = launcher_image_platform(launcher_image)
+        action = "build-launcher-image"
+        if action_completed(run_state, action) and docker_image_exists(launcher_image_tag, platform=platform):
+            print(f"[resume] skipping {action}; image exists: {launcher_image_tag}")
+        else:
+            begin_action(run_state, action)
+            status = build_launcher_image(launcher_image, execute=args.execute)
+            if status:
+                return status
+            complete_action(run_state, action, {"image": launcher_image_tag})
+        manifest["launcher_image"] = launcher_image_manifest(launcher_image)
+
+    if "build-execution-images" in stages:
+        clean_stale_group_dirs(output_dir, groups, execute=args.execute)
+        for group in groups:
+            action = f"build-execution-image:{group.name}"
+            if action_completed(run_state, action) and docker_image_exists(group.tag, platform=group.platform):
+                print(f"[resume] skipping {action}; image exists: {group.tag}")
+            else:
+                begin_action(run_state, action)
+                status = build_execution_image(group, output_dir=output_dir, execute=args.execute)
+                if status:
+                    return status
+                complete_action(run_state, action, {"image": group.tag})
+        manifest["execution_groups"] = [execution_group_manifest(group) for group in groups]
+
+    if "save-images" in stages:
+        launcher_image = cfg.get("launcher_image", {})
+        if launcher_image:
+            output = output_dir / str(launcher_image.get("tar", "launcher-image.tar"))
+            launcher_image_tag = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest")
+            action = f"save-image:{launcher_image_tag}"
+            if action_completed(run_state, action) and output.exists():
+                print(f"[resume] skipping {action}; tar exists: {output}")
+            else:
+                begin_action(run_state, action)
+                status = save_image(launcher_image_tag, output, args.execute)
+                if status:
+                    return status
+                complete_action(run_state, action, {"tar": str(output)})
+            saved_images.append(
+                saved_image_manifest(
+                    launcher_image_tag,
+                    output,
+                    execute=args.execute,
+                    role="launcher",
+                    name="launcher",
+                )
+            )
+        for group in groups:
+            action = f"save-image:{group.tag}"
+            if action_completed(run_state, action) and group.tar.exists():
+                print(f"[resume] skipping {action}; tar exists: {group.tar}")
+            else:
+                begin_action(run_state, action)
+                status = save_image(group.tag, group.tar, args.execute)
+                if status:
+                    return status
+                complete_action(run_state, action, {"tar": str(group.tar)})
+            saved_images.append(
+                saved_image_manifest(group.tag, group.tar, execute=args.execute, role="execution", name=group.name)
+            )
+
+    manifest["persistent_assets"] = {
+        "policy": "models, datasets, checkpoints, and customer data stay on executor-visible persistent storage",
+        "mounts_from_configs": collect_mounts(step_infos.values()),
+        "baked_repo_overlays": [repo_overlay_manifest(item) for item in collect_repo_overlays(step_infos.values())],
+    }
+    manifest["step_execution_images"] = step_execution_image_manifest(groups)
+    manifest["saved_images"] = saved_images
+    manifest_path = output_dir / "airgap-manifest.yaml"
+    manifest_path.write_text(yaml.safe_dump(manifest, sort_keys=False), encoding="utf-8")
+    complete_run_state(run_state, manifest_path=manifest_path)
+    print(f"[airgap] wrote {manifest_path}")
+    if groups:
+        print("[airgap] selected execution images:")
+        for group in groups:
+            image = group.selected_image or group.tag
+            for step_id in group.steps:
+                print(f"  - {step_id}: {image}")
+    return 0
+
+
+def load_yaml(path: Path) -> dict[str, Any]:
+    """Parse the YAML file at ``path`` and return its top-level mapping.
+
+    A document that loads to a falsy value (for example an empty file) is
+    returned as an empty dict.
+
+    Raises:
+        SystemExit: if the top-level YAML value is not a mapping.
+    """
+    parsed = yaml.safe_load(path.read_text(encoding="utf-8"))
+    document = parsed or {}
+    if isinstance(document, dict):
+        return document
+    raise SystemExit(f"{path}: top-level YAML must be a mapping")
+
+
+def normalize_target_specs(values: Iterable[str]) -> list[str]:
+    """Flatten comma-separated target specs into trimmed, non-empty entries.
+
+    Input order is preserved and duplicates are kept.
+    """
+    pieces = (part.strip() for raw in values for part in str(raw).split(","))
+    return [piece for piece in pieces if piece]
+
+
+def with_workflow_targets(cfg: Mapping[str, Any], targets: list[str]) -> dict[str, Any]:
+    """Return a shallow copy of ``cfg`` whose ``workflow.stages`` is ``targets``.
+
+    Other keys of an existing ``workflow`` mapping are kept; a missing or
+    non-mapping ``workflow`` value is replaced by a fresh mapping. ``cfg``
+    itself is not modified.
+    """
+    current = cfg.get("workflow")
+    base = dict(current) if isinstance(current, Mapping) else {}
+    return {**cfg, "workflow": {**base, "stages": targets}}
+
+
+def resolve_input_path(path: Path) -> Path:
+    """Resolve a user-supplied input path, falling back to the repo root.
+
+    Absolute paths and paths that already exist are returned unchanged.
+    Otherwise ``REPO_ROOT / path`` is returned when it exists, and the
+    original ``path`` when it does not.
+    """
+    if not path.is_absolute() and not path.exists():
+        candidate = REPO_ROOT / path
+        if candidate.exists():
+            return candidate
+    return path
+
+
+def resolve_repo_path(path: Path) -> Path:
+    """Return ``path`` unchanged when absolute, otherwise anchor it at ``REPO_ROOT``."""
+    if path.is_absolute():
+        return path
+    return REPO_ROOT / path
+
+
+def docker_context_path(path: Path) -> str:
+    """Return ``path`` as a POSIX-style path relative to ``REPO_ROOT``.
+
+    The path is resolved first, so relative inputs are accepted.
+
+    Raises:
+        SystemExit: if the resolved path is not located under ``REPO_ROOT``.
+    """
+    absolute = path.resolve()
+    try:
+        relative = absolute.relative_to(REPO_ROOT)
+    except ValueError as exc:
+        raise SystemExit(f"{path} must live under the repo root because docker build context is {REPO_ROOT}") from exc
+    return relative.as_posix()
+
+
+def validate_docker_context_path(path: Path, *, field: str) -> None:
+    """Check that ``path`` lies under the repo root, naming ``field`` on failure.
+
+    Raises:
+        SystemExit: with a message that names the offending config field,
+            chained from the error raised by ``docker_context_path``.
+    """
+    try:
+        docker_context_path(path)
+    except SystemExit as exc:
+        raise SystemExit(
+            f"{field}={path} must live under the repo root because Docker builds use {REPO_ROOT}"
+        ) from exc
+
+
+def load_or_start_run_state(
+    output_dir: Path,
+    *,
+    config_path: Path,
+    cfg: Mapping[str, Any],
+    stages: list[str],
+    execute: bool,
+) -> RunState | None:
+    """Resume the in-progress run state in ``output_dir`` or start a new one.
+
+    Returns ``None`` when ``execute`` is false, so no state is tracked. When a
+    progress file already exists, it is reused only if its stored signature
+    matches the signature of the current config and stages. Otherwise a new
+    state is created and written to disk immediately.
+
+    Raises:
+        SystemExit: if the existing progress file is not a YAML mapping or
+            was recorded for a different plan.
+    """
+    if not execute:
+        return None
+    progress_path = output_dir / PROGRESS_STATE
+    complete_path = output_dir / COMPLETE_STATE
+    expected = run_signature(config_path=config_path, cfg=cfg, stages=stages)
+
+    if progress_path.exists():
+        saved = yaml.safe_load(progress_path.read_text(encoding="utf-8")) or {}
+        if not isinstance(saved, dict):
+            raise SystemExit(f"{progress_path} must contain YAML mapping state")
+        if saved.get("signature") != expected:
+            raise SystemExit(
+                f"{progress_path} is an incomplete airgap run for a different plan. "
+                f"Finish it, move it aside, or remove it before starting a new plan."
+            )
+        print(f"[resume] found incomplete run state: {progress_path}")
+        return RunState(path=progress_path, done_path=complete_path, data=saved)
+
+    workflow_cfg = cfg.get("workflow")
+    if not isinstance(workflow_cfg, Mapping):
+        workflow_cfg = {}
+    fresh = {
+        "schema_version": 1,
+        "signature": expected,
+        "config": str(config_path.resolve()),
+        "workflow_stages": list(workflow_cfg.get("stages") or []),
+        "build_stages": stages,
+        "started_at": timestamp(),
+        "completed_actions": {},
+        "discovered_groups": {},
+    }
+    # Record the completed-state file left by an earlier finished run, if any.
+    if complete_path.exists():
+        fresh["previous_complete"] = str(complete_path)
+    state = RunState(path=progress_path, done_path=complete_path, data=fresh)
+    write_run_state(state)
+    print(f"[airgap] progress state={progress_path}")
+    return state
+
+
+def run_signature(*, config_path: Path, cfg: Mapping[str, Any], stages: list[str]) -> str:
+    """Return a 16-hex-character fingerprint of the plan-defining inputs.
+
+    The fingerprint covers the resolved config path, the build stages, and the
+    config sections listed in ``tracked_sections``. It is the first 16 hex
+    digits of the SHA-256 of their key-sorted YAML dump.
+    """
+    tracked_sections = (
+        "workflow",
+        "dependencies",
+        "step_execution_images",
+        "execution_images",
+        "launcher_image",
+    )
+    payload: dict[str, Any] = {"config": str(config_path.resolve()), "stages": stages}
+    payload.update((section, cfg.get(section)) for section in tracked_sections)
+    serialized = yaml.safe_dump(payload, sort_keys=True)
+    digest = hashlib.sha256(serialized.encode("utf-8"))
+    return digest.hexdigest()[:16]
+
+
+def timestamp() -> str:
+    """Return the current UTC time as ISO 8601 with second precision and a ``Z`` suffix."""
+    now = datetime.now(UTC)
+    iso = now.isoformat(timespec="seconds")
+    return iso.replace("+00:00", "Z")
+
+
+def write_run_state(state: RunState | None) -> None:
+    """Persist ``state.data`` to ``state.path``, stamping ``updated_at`` first.
+
+    Does nothing when ``state`` is ``None``.
+
+    The YAML is written to a sibling ``<name>.tmp`` file and then renamed over
+    the target. The state file exists so an interrupted run can resume;
+    overwriting it in place could leave truncated YAML if the process is
+    killed mid-write, which would break the next resume.
+    """
+    if state is None:
+        return
+    state.data["updated_at"] = timestamp()
+    serialized = yaml.safe_dump(state.data, sort_keys=False)
+    # Same directory as the target, so the rename does not cross filesystems.
+    scratch = state.path.with_name(f"{state.path.name}.tmp")
+    scratch.write_text(serialized, encoding="utf-8")
+    scratch.replace(state.path)
+
+
+def begin_action(state: RunState | None, action: str) -> None:
+    """Record ``action`` as the in-flight action and persist the state.
+
+    Does nothing when ``state`` is ``None``.
+    """
+    if state is not None:
+        state.data["current_action"] = {"name": action, "started_at": timestamp()}
+        write_run_state(state)
+
+
+def complete_action(state: RunState | None, action: str, details: Mapping[str, Any] | None = None) -> None:
+    """Mark ``action`` as completed, attach ``details``, and persist the state.
+
+    The ``current_action`` marker is cleared only when it names this same
+    action. Does nothing when ``state`` is ``None``.
+    """
+    if state is None:
+        return
+    record: dict[str, Any] = {"completed_at": timestamp()}
+    record.update(details or {})
+    state.data.setdefault("completed_actions", {})[action] = record
+    in_flight = state.data.get("current_action") or {}
+    if in_flight.get("name") == action:
+        state.data.pop("current_action", None)
+    write_run_state(state)
+
+
+def action_completed(state: RunState | None, action: str) -> bool:
+    """Return whether ``action`` is recorded under ``completed_actions``.
+
+    Always ``False`` when ``state`` is ``None``.
+    """
+    return state is not None and action in (state.data.get("completed_actions") or {})
+
+
+def remember_discovered_groups(state: RunState | None, groups: Iterable[ExecutionGroup]) -> None:
+    """Save each group's import-discovery results into the run state and persist it.
+
+    Any previously stored ``discovered_groups`` mapping is replaced. Does
+    nothing when ``state`` is ``None``.
+    """
+    if state is None:
+        return
+    snapshot: dict[str, dict[str, Any]] = {}
+    for group in groups:
+        snapshot[group.name] = {
+            "candidate_imports": sorted(group.candidate_imports),
+            "missing_imports": group.missing_imports,
+            "missing_core_imports": group.missing_core_imports,
+            "requirements": group.requirements,
+        }
+    state.data["discovered_groups"] = snapshot
+    write_run_state(state)
+
+
+def hydrate_discovered_groups(state: RunState | None, groups: Iterable[ExecutionGroup]) -> bool:
+    """Restore saved import-discovery results onto ``groups``.
+
+    Returns ``False`` without touching any group when ``state`` is ``None`` or
+    when at least one group has no saved entry. Otherwise every group is
+    updated in place and ``True`` is returned.
+    """
+    if state is None:
+        return False
+    saved = state.data.get("discovered_groups") or {}
+    pending = list(groups)
+    if any(group.name not in saved for group in pending):
+        return False
+    for group in pending:
+        record = saved[group.name]
+        group.candidate_imports = set(record.get("candidate_imports") or [])
+        for field in ("missing_imports", "missing_core_imports", "requirements"):
+            setattr(group, field, list(record.get(field) or []))
+    return True
+
+
+def complete_run_state(state: RunState | None, *, manifest_path: Path) -> None:
+    """Finalize the run: write the completed-state file and remove the progress file.
+
+    The final state drops ``current_action`` and records the manifest path and
+    a completion timestamp. Does nothing when ``state`` is ``None``.
+    """
+    if state is None:
+        return
+    final = state.data
+    final.pop("current_action", None)
+    final.update(manifest=str(manifest_path), completed_at=timestamp())
+    state.done_path.write_text(yaml.safe_dump(final, sort_keys=False), encoding="utf-8")
+    state.path.unlink(missing_ok=True)
+    print(f"[airgap] complete state={state.done_path}")
+
+
+def normalize_stages(stages: Iterable[str]) -> list[str]:
+    """Expand, de-duplicate, complete, and order the requested build stages.
+
+    Entries may be comma-separated. With no stages requested, the full
+    pipeline is used. Missing prerequisite stages are added (with a notice
+    printed for each), and the result is sorted into pipeline order; unknown
+    stage names sort after the known ones.
+    """
+    pipeline = (
+        "validate",
+        "discover-execution-deps",
+        "build-launcher-image",
+        "build-execution-images",
+        "save-images",
+    )
+    out: list[str] = []
+    for raw in stages:
+        for item in str(raw).split(","):
+            stage = item.strip()
+            if stage and stage not in out:
+                out.append(stage)
+    if not out:
+        out = list(pipeline)
+
+    # Apply prerequisite edges from later stages toward earlier stages. Each
+    # insertion is idempotent, so a user can ask for any suffix of the pipeline.
+    prerequisite_edges = (
+        ("build-execution-images", "save-images"),
+        ("build-launcher-image", "save-images"),
+        ("discover-execution-deps", "build-execution-images"),
+        ("validate", "discover-execution-deps"),
+        ("validate", "build-execution-images"),
+        ("validate", "save-images"),
+    )
+    for required, requested in prerequisite_edges:
+        if requested in out and required not in out:
+            out.insert(out.index(requested), required)
+            print(f"[airgap] auto-adding stage {required!r} because {requested!r} was requested")
+
+    rank = {stage: position for position, stage in enumerate(pipeline)}
+    out.sort(key=lambda stage: rank.get(stage, len(rank)))
+    return out
+
+
+def stage_needs_targets(stage: str) -> bool:
+    """Return whether ``stage`` is one of the stages that need expanded workflow targets."""
+    target_stages = {"discover-execution-deps", "build-execution-images", "save-images"}
+    return stage in target_stages
+
+
+def expand_targets(cfg: Mapping[str, Any]) -> list[Target]:
+    """Expand ``workflow.stages`` into a dependency-ordered list of targets.
+
+    Dependencies are looked up by step id in ``cfg["dependencies"]`` (or
+    ``workflow["dependencies"]`` as a fallback) and are emitted before the
+    target that needs them. Each target spec appears once.
+
+    Raises:
+        SystemExit: on a dependency cycle, or when no targets are listed.
+    """
+    workflow = cfg.get("workflow") or {}
+    requested = [parse_target(item) for item in workflow.get("stages") or []]
+    deps = cfg.get("dependencies") or workflow.get("dependencies") or {}
+    ordered: list[Target] = []
+    done: set[str] = set()
+    in_progress: set[str] = set()
+    trail: list[str] = []
+
+    def visit(target: Target) -> None:
+        spec = target.spec
+        if spec in in_progress:
+            # Report only the portion of the trail that forms the loop.
+            start = trail.index(spec) if spec in trail else 0
+            cycle = " -> ".join([*trail[start:], spec])
+            raise SystemExit(f"cyclic airgap dependency detected: {cycle}")
+        if spec in done:
+            return
+        in_progress.add(spec)
+        trail.append(spec)
+        for dep in deps.get(target.step_id, []) or []:
+            visit(parse_target(dep))
+        trail.pop()
+        in_progress.remove(spec)
+        done.add(spec)
+        ordered.append(target)
+
+    for target in requested:
+        visit(target)
+    if not ordered:
+        raise SystemExit("workflow.stages must list at least one step")
+    return ordered
+
+
+def parse_target(value: str) -> Target:
+    """Parse a ``step-id[:config]`` string into a ``Target``.
+
+    Both parts are stripped of surrounding whitespace. A missing or empty
+    config becomes ``None``.
+
+    Raises:
+        SystemExit: if the step id is empty.
+    """
+    head, _, tail = str(value).partition(":")
+    step_id = head.strip()
+    if not step_id:
+        raise SystemExit(f"invalid target {value!r}; expected step-id[:config]")
+    return Target(step_id=step_id, config=tail.strip() or None)
+
+
+def validate_targets(targets: Iterable[Target]) -> dict[str, StepInfo]:
+    """Check that each target's step files exist and build its ``StepInfo``.
+
+    For every target, the step directory, ``step.py``, ``step.toml`` and (when
+    a config is named) ``config/<config>.yaml`` must exist under
+    ``STEP_ROOT``. The result is keyed by step id.
+
+    Raises:
+        SystemExit: listing the missing paths of the first invalid target.
+    """
+    infos: dict[str, StepInfo] = {}
+    for target in targets:
+        step_dir = STEP_ROOT / target.step_id
+        step_py = step_dir / "step.py"
+        step_toml = step_dir / "step.toml"
+        config_path = None
+        if target.config:
+            config_path = step_dir / "config" / f"{target.config}.yaml"
+
+        required = [step_dir, step_py, step_toml]
+        if config_path is not None:
+            required.append(config_path)
+        missing = [str(path) for path in required if not path.exists()]
+        if missing:
+            raise SystemExit(f"{target.spec}: missing required path(s): {', '.join(missing)}")
+
+        dotted = target.step_id.replace("/", ".")
+        infos[target.step_id] = StepInfo(
+            target=target,
+            step_dir=step_dir,
+            step_py=step_py,
+            step_toml=step_toml,
+            config_path=config_path,
+            module="nemotron.steps." + dotted + ".step",
+            mounts=read_config_mounts(config_path),
+            repo_overlays=read_config_repo_overlays(config_path),
+        )
+    return infos
+
+
+def read_config_mounts(config_path: Path | None) -> list[Any]:
+    """Return the ``run.env.mounts`` list from a step config, or ``[]``.
+
+    This is best-effort: a missing path, an unreadable or unparsable file, or
+    any level of the ``run`` -> ``env`` -> ``mounts`` chain with an unexpected
+    type all yield an empty list instead of an error.
+    """
+    if config_path is None or not config_path.exists():
+        return []
+    try:
+        data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
+    except Exception:
+        return []
+    if not isinstance(data, Mapping):
+        return []
+    run = data.get("run")
+    env = run.get("env") if isinstance(run, Mapping) else None
+    mounts = env.get("mounts") if isinstance(env, Mapping) else None
+    return mounts if isinstance(mounts, list) else []
+
+
+def execution_groups(
+    cfg: Mapping[str, Any],
+    *,
+    output_dir: Path,
+    step_infos: Mapping[str, StepInfo] | None = None,
+) -> list[ExecutionGroup]:
+    """Plan the execution image groups for the validated steps.
+
+    Raises:
+        SystemExit: if ``step_infos`` is empty or ``None``, or if the config
+            has no ``step_execution_images`` section.
+    """
+    if step_infos:
+        if cfg.get("step_execution_images"):
+            return execution_groups_from_step_execution_images(cfg, output_dir=output_dir, step_infos=step_infos)
+        raise SystemExit("airgap.yaml must define step_execution_images for the selected workflow stages")
+    raise SystemExit("validate must run before execution images can be planned")
+
+
+def execution_groups_from_step_execution_images(
+    cfg: Mapping[str, Any],
+    *,
+    output_dir: Path,
+    step_infos: Mapping[str, StepInfo],
+) -> list[ExecutionGroup]:
+    """Group the validated steps into the execution images that must be built.
+
+    Each step is mapped through ``step_execution_images`` to an entry of
+    ``execution_images``. Steps whose base image and repo overlays yield the
+    same ``execution_group_key`` share one ``ExecutionGroup``, even when they
+    reference differently named execution images.
+
+    Raises:
+        SystemExit: if a step has no ``step_execution_images`` entry, names an
+            unknown execution image, or its image lacks ``base_image``.
+    """
+    step_execution_images = normalize_step_execution_images(cfg.get("step_execution_images") or {})
+    image_defs = normalize_execution_images(cfg.get("execution_images") or {})
+    # Groups keyed by execution_group_key(); dict order follows first appearance in step_infos.
+    merged: dict[str, ExecutionGroup] = {}
+
+    for step_id in step_infos:
+        image_name = step_execution_images.get(step_id)
+        if not image_name:
+            raise SystemExit(f"{step_id}: missing step_execution_images entry in airgap.yaml")
+        image_def = image_defs.get(image_name)
+        if image_def is None:
+            raise SystemExit(f"{step_id}: step_execution_images points to unknown execution image {image_name!r}")
+        base = str(image_def.get("base_image") or "").strip()
+        if not base:
+            raise SystemExit(f"execution_images.{image_name}.base_image is required")
+        repo_overlays = getattr(step_infos[step_id], "repo_overlays", [])
+        group_key = execution_group_key(base, repo_overlays)
+        group = merged.get(group_key)
+        if group is None:
+            suffix = short_hash(
+                {
+                    "base_image": base,
+                    "repo_overlays": [repo_overlay_manifest(item) for item in repo_overlays],
+                }
+            )
+            # name, tag and tar are provisional here; finalize_execution_group_name()
+            # overwrites them below once all member image names are known.
+            # platform and pip_no_deps come from the image definition that first creates the group.
+            group = ExecutionGroup(
+                name=f"{image_name}-{suffix}",
+                base_image=base,
+                tag="",
+                tar=output_dir / "execution-image.tar",
+                steps=[],
+                platform=str(image_def["platform"]) if image_def.get("platform") else None,
+                pip_no_deps=bool(image_def.get("pip_no_deps", True)),
+                repo_overlays=list(repo_overlays),
+            )
+            merged[group_key] = group
+        group.image_names.add(image_name)
+        group.steps.append(step_id)
+        # required_imports accumulate across every image definition that lands in this group.
+        group.required_imports.update(str(name) for name in image_def.get("required_imports") or [])
+        group.repo_overlays = merge_repo_overlays(
+            group.repo_overlays,
+            repo_overlays,
+        )
+    for group in merged.values():
+        finalize_execution_group_name(group, image_defs=image_defs, output_dir=output_dir)
+    return list(merged.values())
+
+
+def finalize_execution_group_name(
+    group: ExecutionGroup,
+    *,
+    image_defs: Mapping[str, Mapping[str, Any]],
+    output_dir: Path,
+) -> None:
+    """Assign the final ``name``, ``tag``, ``tar`` and ``selected_image`` of ``group``.
+
+    A group backed by exactly one execution image uses that image's
+    configured ``tag``/``tar`` (or defaults derived from its name). Otherwise
+    the names are derived from the sorted, sanitized image names joined with
+    ``-``. In both cases a short hash of the base image and repo overlays is
+    appended to the name, tag and tar file name.
+    """
+    names = sorted(group.image_names)
+    suffix = short_hash(
+        {
+            "base_image": group.base_image,
+            "repo_overlays": [repo_overlay_manifest(item) for item in group.repo_overlays],
+        }
+    )
+    if len(names) == 1:
+        label = names[0]
+        spec = image_defs[label]
+        base_tag = str(spec.get("tag") or f"nemotron-execution-{sanitize(label)}:airgap")
+        base_tar = output_dir / str(spec.get("tar") or f"execution-{sanitize(label)}.tar")
+    else:
+        label = "-".join(sanitize(name) for name in names)
+        base_tag = f"nemotron-customizer-{label}-airgap:latest"
+        base_tar = output_dir / f"execution-{label}-image.tar"
+    group.name = f"{label}-{suffix}"
+    group.tag = tag_with_suffix(base_tag, suffix)
+    group.tar = tar_with_suffix(base_tar, suffix)
+    group.selected_image = group.tag
+
+
+def execution_group_key(base_image: str, repo_overlays: Iterable[RepoOverlay]) -> str:
+    """Return a canonical JSON key for a base image plus its repo overlays.
+
+    Overlay manifests are sorted by target, url, ref and repo, so the key does
+    not depend on the order in which overlays were listed.
+    """
+    manifests = [repo_overlay_manifest(item) for item in repo_overlays]
+    manifests.sort(key=lambda item: (item["target"], item["url"], item["ref"], item["repo"]))
+    return json.dumps({"base_image": base_image, "repo_overlays": manifests}, sort_keys=True)
+
+
+def short_hash(value: Any) -> str:
+    """Return the first 8 hex digits of the SHA-256 of ``value``'s compact, key-sorted JSON."""
+    canonical = json.dumps(value, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()[:8]
+
+
+def tag_with_suffix(tag: str, suffix: str) -> str:
+    """Append ``-suffix`` to the repository name of an image reference.
+
+    A ``:version`` after the last path component and an ``@digest`` part are
+    kept in place. A colon earlier in the reference (such as a registry port)
+    is not treated as a version separator.
+    """
+    image, at, digest = tag.partition("@")
+    last_component = image.rpartition("/")[2]
+    if ":" in last_component:
+        name, _, version = image.rpartition(":")
+        image = f"{name}-{suffix}:{version}"
+    else:
+        image = f"{image}-{suffix}"
+    # Without "@" both `at` and `digest` are empty, so this is just `image`.
+    return f"{image}{at}{digest}"
+
+
+def tar_with_suffix(path: Path, suffix: str) -> Path:
+    """Return ``path`` with ``-suffix`` inserted before its final file extension."""
+    stem, extension = path.stem, path.suffix
+    renamed = f"{stem}-{suffix}{extension}"
+    return path.with_name(renamed)
+
+
+def normalize_step_execution_images(raw: Mapping[str, Any]) -> dict[str, str]:
+    """Reduce ``step_execution_images`` entries to ``{step_id: image_name}``.
+
+    A value may be the image name itself or a mapping with a truthy
+    ``execution_image`` key. Entries of any other shape are dropped.
+    """
+    resolved: dict[str, str] = {}
+    for step_id, value in raw.items():
+        if isinstance(value, str):
+            image = value
+        elif isinstance(value, Mapping) and value.get("execution_image"):
+            image = str(value["execution_image"])
+        else:
+            continue
+        resolved[str(step_id)] = image
+    return resolved
+
+
+def normalize_execution_images(raw: Any) -> dict[str, Mapping[str, Any]]:
+    """Keep only the mapping-valued entries of ``execution_images``, keyed by ``str`` name.
+
+    A non-mapping ``raw`` yields an empty dict.
+    """
+    if not isinstance(raw, Mapping):
+        return {}
+    specs: dict[str, Mapping[str, Any]] = {}
+    for name, spec in raw.items():
+        if isinstance(spec, Mapping):
+            specs[str(name)] = spec
+    return specs
+
+
+def read_config_repo_overlays(config_path: Path | None) -> list[RepoOverlay]:
+    """Collect the distinct ``${auto_mount:git+...,target}`` overlays named in a step config.
+
+    The file is scanned as raw text rather than parsed as YAML. A missing
+    path yields an empty list.
+    """
+    if config_path is None or not config_path.exists():
+        return []
+    text = config_path.read_text(encoding="utf-8")
+    matches = re.finditer(r"\$\{auto_mount:(git\+[^,}]+),([^}]+)\}", text)
+    overlays = [parse_git_overlay(match.group(1), match.group(2)) for match in matches]
+    return merge_repo_overlays([], overlays)
+
+
+def parse_git_overlay(spec: str, target: str) -> RepoOverlay:
+    """Parse a ``git+<url>@<ref>`` auto-mount spec into a ``RepoOverlay``.
+
+    The repo name is the last path component of the URL with any ``.git``
+    suffix removed; ``target`` is stripped of surrounding whitespace.
+
+    Raises:
+        SystemExit: if the spec does not start with ``git+``, the URL cannot
+            be parsed, or the ``@ref`` part is missing or empty.
+    """
+    from urllib.parse import urlsplit
+
+    if not spec.startswith("git+"):
+        raise SystemExit(f"invalid auto_mount git spec: {spec!r}")
+    url_and_ref = spec[4:]
+    try:
+        path = urlsplit(url_and_ref).path
+    except ValueError as exc:
+        raise SystemExit(f"invalid auto_mount git spec: {spec!r}") from exc
+    # The ref separator must be an "@" in the URL path. An "@" in the
+    # authority (e.g. ssh://git@host/org/repo) is a username, not a ref, and
+    # must not satisfy the "has a ref" check.
+    if "@" not in path:
+        raise SystemExit(f"invalid auto_mount git spec missing @ref: {spec!r}")
+    url, ref = url_and_ref.rsplit("@", 1)
+    if not ref:
+        raise SystemExit(f"invalid auto_mount git spec missing @ref: {spec!r}")
+    repo = url.rstrip("/").split("/")[-1]
+    if repo.endswith(".git"):
+        repo = repo[:-4]
+    return RepoOverlay(repo=repo, url=url, ref=ref, target=target.strip())
+
+
+def merge_repo_overlays(existing: list[RepoOverlay], incoming: Iterable[RepoOverlay]) -> list[RepoOverlay]:
+    """Return ``existing`` plus the ``incoming`` overlays it does not already contain.
+
+    Overlays are compared by ``(repo, url, ref, target)``. The result is a new
+    list: ``existing`` is not modified, its items keep their order (including
+    any duplicates it already has), and new items follow in first-seen order.
+    """
+
+    def identity(item: RepoOverlay) -> tuple[Any, Any, Any, Any]:
+        return (item.repo, item.url, item.ref, item.target)
+
+    merged = list(existing)
+    known = {identity(item) for item in merged}
+    for candidate in incoming:
+        key = identity(candidate)
+        if key in known:
+            continue
+        known.add(key)
+        merged.append(candidate)
+    return merged
+
+
+def discover_execution_deps(
+    group: ExecutionGroup,
+    *,
+    step_infos: Mapping[str, StepInfo],
+    locked_versions: Mapping[str, str],
+    execute: bool,
+) -> None:
+    """Populate ``group`` with its candidate imports, missing imports and requirements.
+
+    Candidates are the group's required imports plus the external imports
+    found in each member step's ``step.py``. With ``execute`` set, the step
+    modules are imported inside the base image to find what is missing;
+    otherwise ``probe_missing_imports`` is called with ``execute=False``.
+    Missing imports whose root is in ``CORE_IMPORTS`` are reported separately
+    and are not turned into requirements.
+    """
+    candidates: set[str] = set(group.required_imports)
+    for step_id in group.steps:
+        candidates.update(discover_external_imports(step_infos[step_id].step_py))
+    group.candidate_imports = candidates
+
+    if not execute:
+        missing = probe_missing_imports(group.base_image, sorted(candidates), execute=False, platform=group.platform)
+    else:
+        member_modules = [step_infos[step_id].module for step_id in group.steps]
+        missing = probe_step_modules(
+            group.base_image,
+            member_modules,
+            required_imports=candidates,
+            locked_versions=locked_versions,
+            pip_no_deps=group.pip_no_deps,
+            platform=group.platform,
+        )
+
+    group.missing_imports = sorted(set(missing))
+    group.missing_core_imports = sorted(name for name in missing if name.split(".", 1)[0] in CORE_IMPORTS)
+    installable = sorted(name for name in group.missing_imports if name not in group.missing_core_imports)
+    group.requirements = sorted(requirement_for_import(name, locked_versions) for name in installable)
+
+
+def discover_external_imports(start: Path) -> set[str]:
+    """Return the top-level names of third-party modules imported by the file ``start``.
+
+    Every ``import``/``from ... import`` in the file is inspected, including
+    nested ones. Relative imports, names in ``LOCAL_PREFIXES`` and standard
+    library modules are skipped. A file that fails to parse with a
+    ``SyntaxError`` yields an empty set.
+    """
+    try:
+        tree = ast.parse(start.read_text(encoding="utf-8"))
+    except SyntaxError:
+        return set()
+
+    found: set[str] = set()
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            names = [alias.name for alias in node.names]
+        elif isinstance(node, ast.ImportFrom) and not node.level and node.module:
+            names = [node.module]
+        else:
+            continue
+        for dotted in names:
+            root = dotted.split(".", 1)[0]
+            if root not in LOCAL_PREFIXES and not is_stdlib(root):
+                found.add(root)
+    return found
+
+
+def is_stdlib(root: str) -> bool:
+    """Return whether ``root`` is a built-in or standard-library module name.
+
+    ``sys.stdlib_module_names`` is read via ``getattr``, so an interpreter
+    without that attribute only recognizes built-in modules.
+    """
+    return root in sys.builtin_module_names or root in getattr(sys, "stdlib_module_names", set())
+
+
+def probe_missing_imports(image: str, imports: list[str], *, execute: bool, platform: str | None = None) -> list[str]:
+    """Ask a container of ``image`` which of ``imports`` cannot be found.
+
+    With ``execute`` false the ``docker run`` command is only printed and an
+    empty list is returned. With no imports to check, nothing is run at all.
+
+    Raises:
+        SystemExit: with the container's exit code when the probe fails.
+    """
+    if not imports:
+        return []
+    code = "".join(
+        (
+            "import importlib.util,json;",
+            f"mods={imports!r};",
+            "missing=[m for m in mods if importlib.util.find_spec(m) is None];",
+            "print(json.dumps(missing))",
+        )
+    )
+    cmd = ["docker", "run", "--rm", "--pull", "never"]
+    if platform:
+        cmd += ["--platform", platform]
+    cmd += [image, "python", "-c", code]
+    if not execute:
+        print_cmd(cmd)
+        return []
+
+    ensure_image(image, platform=platform)
+    result = subprocess.run(cmd, check=False, capture_output=True, text=True, cwd=REPO_ROOT)
+    if result.returncode != 0:
+        print(result.stderr or result.stdout, file=sys.stderr)
+        raise SystemExit(result.returncode)
+    # The probe prints a JSON list; empty output is treated as "nothing missing".
+    payload = result.stdout.strip() or "[]"
+    return [str(item) for item in json.loads(payload)]
+
+
+def probe_step_modules(
+    image: str,
+    modules: list[str],
+    *,
+    required_imports: Iterable[str],
+    locked_versions: Mapping[str, str],
+    pip_no_deps: bool,
+    platform: str | None = None,
+) -> list[str]:
+    """Import selected step modules in the execution image and discover missing imports.
+
+    The loop installs only the packages it has already identified, in an
+    ephemeral container, so the final requirements file stays based on actual
+    import failures rather than broad static guesses.
+
+    Returns:
+        The top-level import names found missing, in discovery order. The
+        list is returned as soon as the imports succeed, a core import is
+        missing, or the same requirement would have to be installed twice.
+
+    Raises:
+        SystemExit: if a container run fails without a recognizable "No module
+            named" error, or if the probe has not finished after 20 rounds.
+    """
+
+    ensure_image(image, platform=platform)
+    missing: list[str] = []
+    requirements: list[str] = []
+    imports = sorted(set(required_imports))
+    # One Python snippet that imports the required imports first, then the step modules.
+    import_code = "import importlib;"
+    import_code += "".join(f"importlib.import_module({module!r});" for module in imports)
+    import_code += "".join(f"importlib.import_module({module!r});" for module in modules)
+    # Each round discovers at most one new missing import; 20 rounds bounds the probe.
+    for _ in range(20):
+        install = ""
+        if requirements:
+            no_deps = "--no-deps " if pip_no_deps else ""
+            install = "python -m pip install " + no_deps
+            install += " ".join(shlex_quote(req) for req in requirements)
+            # pip output goes to a log file and is echoed only when the install fails;
+            # the trailing "&&" chains the import probe after a successful install.
+            install += (
+                " >/tmp/nemotron-airgap-pip.log 2>&1 "
+                "|| { echo '[airgap-pip] failed:'; cat /tmp/nemotron-airgap-pip.log; exit 1; } && "
+            )
+        # The repo is mounted read-only and put on PYTHONPATH so the step modules are importable;
+        # the pip cache lives in a named volume (see pip_cache_volume) across the --rm containers.
+        cmd = [
+            "docker",
+            "run",
+            "--rm",
+            "--pull",
+            "never",
+            "--mount",
+            f"type=volume,source={pip_cache_volume(platform)},target=/root/.cache/pip",
+            "-v",
+            f"{REPO_ROOT}:/workspace/Nemotron:ro",
+            "-w",
+            "/workspace/Nemotron",
+            "-e",
+            "PYTHONPATH=/workspace/Nemotron/src",
+        ]
+        if platform:
+            cmd.extend(["--platform", platform])
+        cmd.extend([image, "bash", "-lc", install + "python -c " + shlex_quote(import_code)])
+        result = subprocess.run(cmd, check=False, capture_output=True, text=True, cwd=REPO_ROOT)
+        if result.returncode == 0:
+            return missing
+        text = result.stderr + "\n" + result.stdout
+        # Only the first "No module named ..." in the combined output is handled per round.
+        match = re.search(r"(?:ModuleNotFoundError|ImportError):\s+No module named ['\"]([^'\"]+)['\"]", text)
+        if not match:
+            # Any other failure (including a failed pip install) aborts the probe.
+            print(text, file=sys.stderr)
+            raise SystemExit(result.returncode)
+        import_name = match.group(1).split(".", 1)[0]
+        if import_name not in missing:
+            missing.append(import_name)
+        if import_name in CORE_IMPORTS:
+            print(f"[probe] base image is missing core import {import_name!r}; choose a compatible execution image")
+            return missing
+        requirement = requirement_for_import(import_name, locked_versions)
+        # The requirement was already installed this way and the import still fails: stop retrying.
+        if requirement in requirements:
+            return missing
+        requirements.append(requirement)
+    raise SystemExit(f"import probe did not converge for {image}")
+
+
+def requirement_for_import(import_name: str, locked_versions: Mapping[str, str]) -> str:
+    """Return the pip requirement for ``import_name``, pinned when a locked version is known.
+
+    The package name comes from ``package_for_import``; its version is looked
+    up in ``locked_versions`` under the normalized package name.
+    """
+    package = package_for_import(import_name)
+    pinned = locked_versions.get(normalize_package(package))
+    if pinned:
+        return f"{package}=={pinned}"
+    return package
+
+
+def package_for_import(import_name: str) -> str:
+    """Map an import name to the distribution package that provides it.
+
+    Lookup order: an explicit entry in ``IMPORT_ALIASES``, then the first
+    distribution reported by ``metadata.packages_distributions()`` in the
+    current environment (normalized), then the import name with underscores
+    replaced by hyphens.
+    """
+    if import_name in IMPORT_ALIASES:
+        return IMPORT_ALIASES[import_name]
+    distributions = metadata.packages_distributions().get(import_name)
+    if not distributions:
+        return import_name.replace("_", "-")
+    return normalize_package(distributions[0])
+
+
+def locked_package_versions(lock_path: Path) -> dict[str, str]:
+    """Read ``{normalized package name: version}`` from the ``package`` tables of a TOML lock file.
+
+    A missing lock file yields an empty dict. Entries whose ``name`` or
+    ``version`` is not a string are skipped.
+    """
+    if not lock_path.exists():
+        return {}
+    lock = tomllib.loads(lock_path.read_text(encoding="utf-8"))
+    versions: dict[str, str] = {}
+    for entry in lock.get("package", []) or []:
+        name, version = entry.get("name"), entry.get("version")
+        if not (isinstance(name, str) and isinstance(version, str)):
+            continue
+        versions[normalize_package(name)] = version
+    return versions
+
+
+def normalize_package(name: str) -> str:
+    """Lower-case ``name`` and collapse each run of ``-``, ``_`` or ``.`` into a single ``-``."""
+    collapsed = re.sub(r"[-_.]+", "-", name)
+    return collapsed.lower()
+
+
+def build_launcher_image(launcher_image: Mapping[str, Any], *, execute: bool) -> int:
+    """Build the launcher image from ``Dockerfile.launcher``, or print the command.
+
+    The tag and base image fall back to defaults when not configured. With
+    ``execute`` set, ``ensure_image`` is called for the base image before the
+    build.
+
+    Returns:
+        The status returned by ``run_or_print``.
+    """
+    image = str(launcher_image.get("tag") or "nemotron-customizer-launcher-airgap:latest")
+    base = str(launcher_image.get("base_image") or "python:3.12-slim")
+    platform = launcher_image_platform(launcher_image)
+
+    cmd = ["docker", "build"]
+    if platform:
+        cmd += ["--platform", platform]
+    cmd += ["-f", str(AIRGAP_DIR / "Dockerfile.launcher")]
+    cmd += ["--build-arg", f"BASE_IMAGE={base}"]
+    cmd += ["--build-arg", f"UV_VERSION={UV_VERSION}"]
+    cmd += ["-t", image, "."]
+
+    if execute:
+        ensure_image(base, platform=platform)
+    return run_or_print(cmd, execute)
+
+
+def launcher_image_platform(launcher_image: Mapping[str, Any]) -> str | None:  # None when unset or falsy
+    return str(value) if (value := launcher_image.get("platform")) else None
+
+
+def build_execution_image(group: ExecutionGroup, *, output_dir: Path, execute: bool) -> int:
+    """Write the group's build inputs, then run (or print) ``docker build`` for its execution image.
+
+    The requirements file and repo-overlay manifest are written even on a dry run, and
+    their paths are stored on ``group``. Returns the exit code from ``run_or_print``.
+    """
+    group_dir = output_dir / "execution-context" / group.name
+    group_dir.mkdir(parents=True, exist_ok=True)
+
+    # One requirement per line, newline-terminated unless the list is empty.
+    group.requirements_path = group_dir / f"requirements-{group.name}.txt"
+    trailing_newline = "\n" if group.requirements else ""
+    group.requirements_path.write_text("\n".join(group.requirements) + trailing_newline, encoding="utf-8")
+    repos_root = output_dir / "repo-overlays" / group.name
+    prepare_repo_overlays(group, repos_root=repos_root, execute=execute)
+    group.repo_overlays_path = group_dir / f"repo-overlays-{group.name}.json"
+    overlay_entries = [repo_overlay_build_manifest(item) for item in group.repo_overlays]
+    group.repo_overlays_path.write_text(json.dumps(overlay_entries, indent=2) + "\n", encoding="utf-8")
+
+    # Insertion order of this dict fixes the order of the --build-arg flags.
+    build_args = {
+        "BASE_IMAGE": group.base_image,
+        "EXECUTION_REQUIREMENTS": docker_context_path(group.requirements_path),
+        "REPO_OVERLAYS": docker_context_path(group.repo_overlays_path),
+        "REPO_OVERLAYS_DIR": docker_context_path(repos_root),
+        "PIP_NO_DEPS": "true" if group.pip_no_deps else "false",
+    }
+
+    cmd = ["docker", "build"]
+    if group.platform:
+        cmd += ["--platform", group.platform]
+    cmd += ["-f", str(AIRGAP_DIR / "Dockerfile.execution")]
+    for key, value in build_args.items():
+        cmd += ["--build-arg", f"{key}={value}"]
+    cmd += ["-t", group.tag, "."]
+    if execute:
+        # Pull the base image first if no matching copy is cached; a dry run skips this.
+        ensure_image(group.base_image, platform=group.platform)
+    return run_or_print(cmd, execute)
+
+
+def prepare_repo_overlays(group: ExecutionGroup, *, repos_root: Path, execute: bool) -> None:
+    """Clone or refresh each overlay under ``repos_root`` and check out its ref; exit if checkout fails."""
+    repos_root.mkdir(parents=True, exist_ok=True)
+    (repos_root / ".keep").touch()
+    for overlay in group.repo_overlays:
+        dest = repos_root / repo_overlay_dir_name(overlay)
+        fetch = ["git", "-C", str(dest), "fetch", "--all", "--tags", "--force", "--prune"]
+        run_or_print(fetch if dest.exists() else ["git", "clone", overlay.url, str(dest)], execute)
+        if status := run_or_print(["git", "-C", str(dest), "checkout", overlay.ref], execute):
+            raise SystemExit(status)
+
+
+def save_image(image: str, output: Path, execute: bool) -> int:  # `docker save` the image into the tar at output
+    return run_or_print(["docker", "save", "-o", f"{output}", image], execute, mkdir=output.parent)
+
+
+def ensure_image(image: str, *, platform: str | None = None) -> None:
+    """Pull ``image`` unless a matching copy is already cached; exit with docker's code on failure."""
+    if docker_image_exists(image, platform=platform):
+        return
+    if platform:
+        note, platform_args = f" for {platform}", ["--platform", platform]
+    else:
+        note, platform_args = "", []
+    print(f"[docker] pulling missing base image{note}: {image}")
+    pull = subprocess.run(["docker", "pull", *platform_args, image], check=False, cwd=REPO_ROOT)
+    if pull.returncode:
+        raise SystemExit(pull.returncode)
+
+
+def docker_image_exists(image: str, *, platform: str | None = None) -> bool:
+    """Return True when ``image`` is present locally and its platform satisfies ``platform``."""
+    return (found := docker_image_platform(image)) is not None and platform_matches(found, platform)
+
+
+def docker_image_platform(image: str) -> str | None:
+    """Return the local image's ``os/arch[/variant]`` string, or None if it cannot be read.
+
+    None is returned when ``docker image inspect`` exits non-zero or prints nothing.
+    """
+    template = "{{.Os}}/{{.Architecture}}{{if .Variant}}/{{.Variant}}{{end}}"
+    proc = subprocess.run(
+        ["docker", "image", "inspect", "--format", template, image],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL,
+        text=True,
+        cwd=REPO_ROOT,
+    )
+    if proc.returncode != 0:
+        return None
+    # Only the first line of output is used.
+    lines = proc.stdout.strip().splitlines()
+    return lines[0] if lines else None
+
+
+def platform_matches(cached: str | None, requested: str | None) -> bool:
+    """Return True if ``cached`` equals ``requested`` or extends it with a ``/suffix``.
+
+    A missing ``cached`` never matches; an empty or None ``requested`` accepts any cached platform.
+    """
+    return cached is not None and (not requested or cached == requested or cached.startswith(f"{requested}/"))
+
+
+def pip_cache_volume(platform: str | None = None) -> str:
+    """Return the pip cache volume name for ``platform`` (``default`` when none is given)."""
+    return "nemotron-airgap-pip-cache-" + sanitize(platform or "default")
+
+
+def run_or_print(cmd: list[str], execute: bool, *, mkdir: Path | None = None) -> int:
+    print_cmd(cmd)  # the command is echoed in both modes
+    if execute:
+        if mkdir is not None:
+            mkdir.mkdir(parents=True, exist_ok=True)
+        return subprocess.run(cmd, check=False, cwd=REPO_ROOT).returncode
+    return 0  # dry run: nothing was executed
+
+
+def clean_stale_group_dirs(output_dir: Path, groups: Iterable[ExecutionGroup], *, execute: bool) -> None:
+    """Delete (or, on a dry run, print ``rm -rf`` for) group directories not named in ``groups``."""
+    active = {group.name for group in groups}
+    for sub in ("execution-context", "repo-overlays"):
+        root = output_dir / sub
+        if not root.exists():
+            continue
+        # Plain files and directories belonging to a current group are left alone.
+        for stale in (p for p in root.iterdir() if p.is_dir() and p.name not in active):
+            if not execute:
+                print_cmd(["rm", "-rf", str(stale)])
+                continue
+            shutil.rmtree(stale)
+            print(f"[clean] removed stale {stale}")
+
+
+def sha256_file(path: Path) -> str:  # hex SHA-256 of the file, read in 1 MiB blocks
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        while block := stream.read(1024 * 1024):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def saved_image_manifest(image: str, output: Path, *, execute: bool, role: str, name: str) -> dict[str, Any]:
+    """Describe a saved image tarball as a manifest entry.
+
+    ``sha256`` is computed only when ``execute`` is true and ``output`` exists; otherwise it is None.
+    """
+    checksum = None
+    if execute and output.exists():
+        checksum = sha256_file(output)
+    return {
+        "role": role,
+        "name": name,
+        "image": image,
+        "tar": str(output),
+        "sha256": checksum,
+    }
+
+
+def print_cmd(cmd: list[str]) -> None:  # echo cmd as a shell-quoted "$ ..." line
+    print("$ " + " ".join(map(shlex_quote, cmd)))
+
+
+def shlex_quote(value: str) -> str:
+    """Return ``str(value)`` quoted so it reads as a single shell token."""
+    from shlex import quote
+    return quote(str(value))
+
+
+def collect_mounts(infos: Iterable[StepInfo]) -> list[Any]:
+    """Concatenate the ``mounts`` of every step info, preserving order and duplicates."""
+    mounts: list[Any] = []
+    mounts.extend(mount for info in infos for mount in info.mounts)
+    return mounts
+
+
+def collect_repo_overlays(infos: Iterable[StepInfo]) -> list[RepoOverlay]:
+    merged: list[RepoOverlay] = []  # running result, combined via merge_repo_overlays()
+    for step in infos:
+        merged = merge_repo_overlays(merged, step.repo_overlays)
+    return merged
+
+
+def repo_overlay_manifest(item: RepoOverlay) -> dict[str, str]:
+    """Return the overlay's identifying fields as a plain dict.
+
+    Keys are ``repo``, ``url``, ``ref`` and ``target``, in that order.
+    """
+    fields = ("repo", "url", "ref", "target")
+    return {field: getattr(item, field) for field in fields}
+
+
+def repo_overlay_build_manifest(item: RepoOverlay) -> dict[str, str]:
+    """Return the overlay manifest plus a ``source`` key holding its directory name."""
+    manifest = repo_overlay_manifest(item)
+    return {**manifest, "source": repo_overlay_dir_name(item)}
+
+
+def repo_overlay_dir_name(item: RepoOverlay) -> str:  # "<sanitized repo>-<short hash of its manifest>"
+    return "{}-{}".format(sanitize(item.repo), short_hash(repo_overlay_manifest(item)))
+
+
+def step_to_manifest(info: StepInfo) -> dict[str, Any]:
+    """Summarize a step for the manifest, with file paths made relative to ``REPO_ROOT``."""
+    manifest: dict[str, Any] = {"target": info.target.spec}
+    manifest["step_py"] = str(info.step_py.relative_to(REPO_ROOT))
+    manifest["step_toml"] = str(info.step_toml.relative_to(REPO_ROOT))
+    manifest["config"] = str(info.config_path.relative_to(REPO_ROOT)) if info.config_path else None
+    manifest["module"] = info.module
+    return manifest
+
+
+def execution_group_manifest(group: ExecutionGroup) -> dict[str, Any]:
+    return {  # manifest entry for one execution group; paths become strings, sets become sorted lists
+        "name": group.name,
+        "image_names": sorted(group.image_names),
+        "base_image": group.base_image,
+        "platform": group.platform,
+        "tag": group.tag,
+        "selected_image": group.selected_image or group.tag,  # falls back to the build tag
+        "tar": str(group.tar),
+        "steps": group.steps,
+        "pip_no_deps": group.pip_no_deps,
+        "candidate_imports": sorted(group.candidate_imports),
+        "missing_imports": group.missing_imports,
+        "missing_core_imports": group.missing_core_imports,
+        "requirements": group.requirements,
+        "requirements_path": str(group.requirements_path) if group.requirements_path else None,
+        "repo_overlays": [repo_overlay_manifest(item) for item in group.repo_overlays],
+        "repo_overlays_path": str(group.repo_overlays_path) if group.repo_overlays_path else None,
+    }
+
+
+def step_execution_image_manifest(groups: Iterable[ExecutionGroup]) -> dict[str, str]:
+    """Map each step id to its group's image (``selected_image``, else ``tag``).
+
+    When a step id appears in several groups, the last group wins.
+    """
+    pairs = ((group.selected_image or group.tag, group.steps) for group in groups)
+    return {step_id: image for image, steps in pairs for step_id in steps}
+
+
+def launcher_image_manifest(launcher_image: Mapping[str, Any]) -> dict[str, Any]:
+    """Return launcher image settings, filling defaults for falsy values (``platform`` has none)."""
+    lookup = launcher_image.get
+    base_image = lookup("base_image") or "python:3.12-slim"
+    tag = lookup("tag") or "nemotron-customizer-launcher-airgap:latest"
+    tar = lookup("tar") or "launcher-image.tar"
+    return {"base_image": base_image, "platform": lookup("platform"), "tag": tag, "tar": tar}
+
+
+def sanitize(value: str) -> str:  # lowercase slug of [a-z0-9_.-]; "image" if nothing survives
+    return "-".join(re.split(r"[^a-zA-Z0-9_.-]+", value)).strip("-").lower() or "image"
+
+
+if __name__ == "__main__":  # script entry point
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/docs/customize/steps/prep/index.md b/docs/customize/steps/data_prep/index.md
similarity index 51%
rename from docs/customize/steps/prep/index.md
rename to docs/customize/steps/data_prep/index.md
index e76dc0e07..08ec89553 100644
--- a/docs/customize/steps/prep/index.md
+++ b/docs/customize/steps/data_prep/index.md
@@ -1,6 +1,6 @@
 # Data Preparation
 
-```{include} ../../../../src/nemotron/steps/prep/guide.md
+```{include} ../../../../src/nemotron/steps/data_prep/guide.md
 ```
 
 ```{toctree}
diff --git a/docs/customize/steps/data_prep/sft-packing.md b/docs/customize/steps/data_prep/sft-packing.md
new file mode 100644
index 000000000..552071784
--- /dev/null
+++ b/docs/customize/steps/data_prep/sft-packing.md
@@ -0,0 +1,23 @@
+# SFT Data Packing
+
+```{step-toml} src/nemotron/steps/data_prep/sft_packing/step.toml
+```
+
+## Reference Implementation
+
+```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/step.py
+:language: python
+:caption: step.py
+```
+
+## Starter Configs
+
+```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/config/default.yaml
+:language: yaml
+:caption: config/default.yaml
+```
+
+```{literalinclude} ../../../../src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml
+:language: yaml
+:caption: config/tiny.yaml
+```
diff --git a/docs/customize/steps/index.md b/docs/customize/steps/index.md
index 30a9078e3..8c9fe2da7 100644
--- a/docs/customize/steps/index.md
+++ b/docs/customize/steps/index.md
@@ -10,7 +10,7 @@ types
 hardware
 curate/index
 translate/index
-prep/index
+data_prep/index
 sft/index
 eval/index
 convert/index
diff --git a/docs/customize/steps/prep/sft-packing.md b/docs/customize/steps/prep/sft-packing.md
deleted file mode 100644
index b375f4686..000000000
--- a/docs/customize/steps/prep/sft-packing.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# SFT Data Packing
-
-```{step-toml} src/nemotron/steps/prep/sft_packing/step.toml
-```
-
-## Reference Implementation
-
-```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/step.py
-:language: python
-:caption: step.py
-```
-
-## Starter Configs
-
-```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/config/default.yaml
-:language: yaml
-:caption: config/default.yaml
-```
-
-```{literalinclude} ../../../../src/nemotron/steps/prep/sft_packing/config/tiny.yaml
-:language: yaml
-:caption: config/tiny.yaml
-```
diff --git a/docs/customize/steps/sft/megatron-bridge.md b/docs/customize/steps/sft/megatron-bridge.md
index d35822ffc..784f5f67d 100644
--- a/docs/customize/steps/sft/megatron-bridge.md
+++ b/docs/customize/steps/sft/megatron-bridge.md
@@ -12,9 +12,9 @@
 
 ## Starter Configs
 
-```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/nano3.yaml
+```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/default.yaml
 :language: yaml
-:caption: config/nano3.yaml
+:caption: config/default.yaml
 ```
 
 ```{literalinclude} ../../../../src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml
diff --git a/docs/customize/steps/translate/index.md b/docs/customize/steps/translate/index.md
index aaeec3de3..3a6f7a82a 100644
--- a/docs/customize/steps/translate/index.md
+++ b/docs/customize/steps/translate/index.md
@@ -3,5 +3,5 @@
 ```{toctree}
 :maxdepth: 1
 
-translation
+nemo-curator
 ```
diff --git a/docs/customize/steps/translate/translation.md b/docs/customize/steps/translate/nemo-curator.md
similarity index 54%
rename from docs/customize/steps/translate/translation.md
rename to docs/customize/steps/translate/nemo-curator.md
index 5c1a8cded..87e4317ad 100644
--- a/docs/customize/steps/translate/translation.md
+++ b/docs/customize/steps/translate/nemo-curator.md
@@ -5,7 +5,7 @@ It should stay a thin wrapper around Curator; do not generate custom chunking or
 pandas processing unless a single huge input file needs a one-off preprocessing
 stage.
 
-```{step-toml} src/nemotron/steps/translate/translation/step.toml
+```{step-toml} src/nemotron/steps/translate/nemo_curator/step.toml
 ```
 
 ## Agent Checklist
@@ -22,29 +22,49 @@ stage.
 
 ## CLI
 
-Run the step directly:
+Install the Curator-backed translation dependencies before running the step:
 
 ```bash
-nemotron steps translation \
+uv sync --extra translate
+```
+
+Run the step through the generic step dispatcher with bare `key=value`
+overrides appended at the end of the command:
+
+```bash
+uv run --extra translate nemotron steps run translate/nemo_curator \
   input_path=/path/to/source.jsonl \
   output_dir=/path/to/translated \
   source_language=en \
   target_language=hi
 ```
 
-Use `-c` or `--config` to pass a config file or config name from the step's
-`config/` directory. The CLI currently supports local execution only.
+Use `-c` or `--config` to pass a config name from the step's `config/`
+directory or a path to a YAML file. Trailing tokens that contain `=` and do
+not begin with `-` are routed into the Hydra-style dotlist override layer.
+
+For batch executors such as Lepton or Slurm, add `--batch <profile>`:
+
+```bash
+uv run nemotron steps run translate/nemo_curator \
+  -c default \
+  --batch lepton_translate \
+  input_path=/mnt/lustre-shared/data/source.jsonl \
+  output_dir=/mnt/lustre-shared/output/translated \
+  source_language=en \
+  target_language=hi
+```
 
 ## Reference Implementation
 
-```{literalinclude} ../../../../src/nemotron/steps/translate/translation/step.py
+```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_curator/step.py
 :language: python
 :caption: step.py
 ```
 
 ## Starter Config
 
-```{literalinclude} ../../../../src/nemotron/steps/translate/translation/config/default.yaml
+```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_curator/config/default.yaml
 :language: yaml
 :caption: config/default.yaml
 ```
diff --git a/docs/customize/steps/translate/nemo-skills.md b/docs/customize/steps/translate/nemo-skills.md
deleted file mode 100644
index a78f55d7f..000000000
--- a/docs/customize/steps/translate/nemo-skills.md
+++ /dev/null
@@ -1,18 +0,0 @@
-# Translation + FAITH Scoring (NeMo Skills)
-
-```{step-toml} src/nemotron/steps/translate/nemo_skills/step.toml
-```
-
-## Reference Implementation
-
-```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_skills/step.py
-:language: python
-:caption: step.py
-```
-
-## Starter Config
-
-```{literalinclude} ../../../../src/nemotron/steps/translate/nemo_skills/config/default.yaml
-:language: yaml
-:caption: config/default.yaml
-```
diff --git a/pyproject.toml b/pyproject.toml
index bb45a94b0..d5895a869 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "nemotron"
 version = "0.1.0"
 description = "Reproducible training recipes for NVIDIA Nemotron model family - transparent pipelines for data preparation, training, and evaluation across all stages"
-requires-python = ">=3.10"
+requires-python = ">=3.10,<3.14"
 license = {text = "MIT"}
 authors = [
     {name = "Nemotron Contributors"}
@@ -33,7 +33,7 @@ dependencies = [
     "numpy>=1.24.0",
     "pyarrow>=14.0.0",
     "xxhash>=3.4.0",
-    "transformers>=4.36.0",
+    "transformers>=4.57.6,<5.0",
     "huggingface_hub>=0.20.0",
     "datasets>=2.14.0",  # Required for ray.data.from_huggingface
     "pyyaml>=6.0",
@@ -64,15 +64,37 @@ audio = [
 # `uv run --no-project` (they declare their own PEP 723 inline deps).
 data-sdg = ["data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'"]
 byob = [
-    # BYOB uses Curator's current translation and semantic dedup stack, which is Python 3.11+.
+    # BYOB CPU/runtime dependencies. GPU semantic-dedup/outlier dependencies live in `byob-gpu`.
+    "cosmos-xenna>=0.2,<0.3; python_version>='3.11' and python_version<'3.14'",
     "data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'",
-    "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
+    "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
+    "datasets>=2.14.0; python_version>='3.11'",
+    "numpy>=2.2,<3; python_version>='3.11'",
     "pandas>=2.1.0; python_version>='3.11'",
-    "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'",
-    "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'",
+    "pyarrow>=14.0.0; python_version>='3.11'",
+    "pyyaml>=6.0; python_version>='3.11'",
+    "pydantic>=2.0.0; python_version>='3.11'",
+    "requests>=2.0.0; python_version>='3.11'",
+    "tqdm; python_version>='3.11'",
+    "urllib3>=2.7.0,<3; python_version>='3.11'",
+    "obstore>=0.8,<0.9; python_version>='3.11'",
+    "portpicker>=1.6,<2; python_version>='3.11'",
+    "pulp>=3.3,<4; python_version>='3.11'",
+    "attrs>=25.4,<26; python_version>='3.11'",
+    "cattrs>=25.3,<26; python_version>='3.11'",
+    "jinja2>=3.1,<4; python_version>='3.11'",
+    "loguru>=0.7,<1; python_version>='3.11'",
+    "tabulate>=0.9,<1; python_version>='3.11'",
     "sacrebleu>=2.6.0,<3.0.0; python_version>='3.11'",
     "iso639-lang>=2.6.0,<3.0.0; python_version>='3.11'",
     "bcp47>=0.1.0; python_version>='3.11' and python_version<'3.14'",
+]
+byob-gpu = [
+    "torch>=2.10,<2.11; python_version>='3.11'",
+    "transformers>=4.57.6,<5.0; python_version>='3.11'",
+    "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'",
+    "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'",
+    "cupy-cuda12x>=14.0,<15; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "cuda-bindings>=12.9,<13; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "cuda-python>=12.9,<13; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "cudf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
@@ -82,8 +104,29 @@ byob = [
     "raft-dask-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "rapidsmpf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
 ]
+translate = [
+    "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
+    "pyarrow>=14.0.0; python_version>='3.11'",
+    "pyyaml>=6.0; python_version>='3.11'",
+    "obstore>=0.8,<0.9; python_version>='3.11'",
+    "portpicker>=1.6,<2; python_version>='3.11'",
+    "pulp>=3.3,<4; python_version>='3.11'",
+    "attrs>=25.4,<26; python_version>='3.11'",
+    "cattrs>=25.3,<26; python_version>='3.11'",
+    "jinja2>=3.1,<4; python_version>='3.11'",
+    "loguru>=0.7,<1; python_version>='3.11'",
+    "tabulate>=0.9,<1; python_version>='3.11'",
+    "sacrebleu>=2.6.0,<3.0.0; python_version>='3.11'",
+    "bcp47>=0.1.0; python_version>='3.11' and python_version<'3.14'",
+]
+evaluator = ["nemo-evaluator-launcher>=0.1.0"]
+curate = [
+    "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
+    "huggingface_hub>=0.20.0; python_version>='3.11'",
+    "pyyaml>=6.0; python_version>='3.11'",
+]
 dev = [
-    "pytest>=7.0.0",
+    "pytest>=9.0.3",
     "pytest-cov>=4.0.0",
     "mypy>=1.0.0",
     "ruff>=0.1.0",
@@ -97,7 +140,7 @@ all = [
     "webdataset>=0.2.86",
     "imageio-ffmpeg>=0.5.1",
     "data-designer==0.5.5; python_version>='3.11' and python_version<'3.14'",
-    "nemo-curator @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
+    "nemo-curator[translation_all] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main ; python_version>='3.11' and python_version<'3.14'",
     "pandas>=2.1.0; python_version>='3.11'",
     "sentence-transformers>=5.0.0,<6.0.0; python_version>='3.11'",
     "scikit-learn>=1.7.0,<1.8.0; python_version>='3.11'",
@@ -112,6 +155,7 @@ all = [
     "pylibraft-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "raft-dask-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
     "rapidsmpf-cu12==25.10.*; python_version>='3.11' and platform_system=='Linux' and platform_machine=='x86_64'",
+    "nemo-evaluator-launcher>=0.1.0",
 ]
 
 # Note: megatron-bridge is required for training but not listed as a dependency
@@ -173,18 +217,116 @@ package = true
 constraint-dependencies = [
     # Curator main currently keeps these constraints in its own uv config, but
     # they are not published transitively through package metadata.
-    "transformers>=4.56.0,<5.0",
+    "transformers>=4.57.6,<5.0",
+    "python-multipart>=0.0.29",
+    "cryptography>=48.0.0",
+    "gitpython>=3.1.50",
+    "pytest>=9.0.3",
 ]
 override-dependencies = [
     # data-designer-engine==0.5.5 declares huggingface-hub>=1.0.1, while
     # Curator's Transformers-compatible stack requires the pre-1.0 API.
     "huggingface-hub>=0.34,<1.0",
     "torch==2.10.0",
+    # torchx==0.7.0 still caps urllib3<1.27 through nemo-run. Force the
+    # patched urllib3 line while we validate torchx/nemo-run compatibility.
+    "urllib3>=2.7.0,<3",
+]
+
+[tool.nemotron.runtime.byob]
+extras = ["byob"]
+venv-name = "byob"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = [
+    "bcp47",
+    "cattrs",
+    "data_designer",
+    "datasets",
+    "jinja2",
+    "loguru",
+    "nemo_curator",
+    "numpy",
+    "obstore",
+    "pandas",
+    "portpicker",
+    "pulp",
+    "pyarrow",
+    "pydantic",
+    "requests",
+    "sacrebleu",
+    "tabulate",
+    "tqdm",
+    "yaml",
+]
+
+[tool.nemotron.runtime.byob-gpu]
+extras = ["byob", "byob-gpu"]
+venv-name = "byob-gpu"
+extra-index-urls = ["https://pypi.nvidia.com"]
+torch-backend = "cu128"
+omit-packages = ["nemo-curator"]
+required-imports = [
+    "bcp47",
+    "cattrs",
+    "data_designer",
+    "datasets",
+    "jinja2",
+    "loguru",
+    "nemo_curator",
+    "numpy",
+    "obstore",
+    "pandas",
+    "portpicker",
+    "pulp",
+    "pyarrow",
+    "pydantic",
+    "requests",
+    "sacrebleu",
+    "sentence_transformers",
+    "sklearn",
+    "tabulate",
+    "tqdm",
+    "yaml",
+]
+# Importing RAPIDS packages can initialize CUDA libraries. Checking only their import
+# specs verifies the wheels are installed without loading CUDA during bootstrap readiness checks.
+spec-only-imports = ["cudf", "cuml", "cupy"]
+
+[tool.nemotron.runtime.translate]
+extras = ["translate"]
+venv-name = "translate"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = [
+    "bcp47",
+    "cattrs",
+    "jinja2",
+    "loguru",
+    "nemo_curator",
+    "obstore",
+    "portpicker",
+    "pulp",
+    "pyarrow",
+    "sacrebleu",
+    "tabulate",
+    "yaml",
+]
+
+[tool.nemotron.runtime.curate]
+extras = ["curate"]
+venv-name = "curate"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = [
+    "huggingface_hub",
+    "nemo_curator",
+    "yaml",
 ]
 
 [dependency-groups]
 dev = [
-    "pytest>=9.0.2",
+    "pytest>=9.0.3",
 ]
 run = [
     "nemo-run @ git+https://github.com/NVIDIA-NeMo/Run.git@main",
diff --git a/skills/INDEX.md b/skills/INDEX.md
index cbbab7320..7b1675a11 100644
--- a/skills/INDEX.md
+++ b/skills/INDEX.md
@@ -20,12 +20,13 @@ its own `SKILL.md` (frontmatter + body) and lives in a sibling directory.
 skills/                            ← workflow & reference skills (this directory)
 └── nemotron-customize/            ← e.g. pipeline-builder skill
     ├── SKILL.md                   ← agent entry point (Orient/Plan/Act/Verify)
-    ├── act/                       ← codegen rules loaded during Act phase
-    │   ├── PROJECT.md             ← project-scaffold rules (R1–R10)
-    │   └── STAGE.md               ← per-stage rules (R1–R5, dry-run, W&B)
-    └── context/                   ← authored library API extracts for codegen
-        ├── index.toml             ← (step_id, intent) → pack file
-        └── README.md              ← provenance + nv-base notes
+    ├── references/
+    │   ├── act/                   ← codegen rules loaded during Act phase
+    │   │   ├── PROJECT.md         ← project-scaffold rules (R1–R10)
+    │   │   └── STAGE.md           ← per-stage rules (R1–R5, dry-run, W&B)
+    │   └── context/               ← authored library API extracts for codegen
+    │       ├── index.toml         ← (step_id, intent) → pack file
+    │       └── README.md          ← provenance notes
 
 src/nemotron/steps/                ← step library (the catalog skills route into)
 ├── SKILL.md                       ← per-category routing
@@ -37,7 +38,7 @@ src/nemotron/steps/                ← step library (the catalog skills route in
     ├── step.toml                  ← machine contract (consumes/produces/params/strategies/errors)
     ├── SKILL.md                   ← agent prose: when/why/gotchas (per-step)
     ├── step.py                    ← runspec + entry point
-    └── config/                    ← default.yaml + tiny.yaml
+    └── config/                    ← one or more named configs
 ```
 
 **Rule of thumb:**
@@ -48,12 +49,6 @@ src/nemotron/steps/                ← step library (the catalog skills route in
 
 ## Validation
 
-This directory is validated by [nv-base](https://gitlab-master.nvidia.com/ai_tools/nvcarps_team/nv-base):
-
-```bash
-nv-base validate skills/ --type skill --no-llm -r cli json -o reports/nv-base -c
-```
-
 Every `SKILL.md` requires a YAML frontmatter block:
 
 ```markdown
@@ -63,9 +58,8 @@ description: <one-line "when to use" hook>
 ---
 ```
 
-The 17 files under `nemotron-customize/context/*.txt` are extracted upstream
-documentation from the Nemotron-stack libraries (Megatron-Bridge, AutoModel,
-Curator, NeMo-RL, Speaker, Evaluator, ModelOpt, Data Designer). They contain
-code snippets that nv-base flags for `Env Variable Harvesting`, `Credential
-Access`, etc. — these are **documentation false positives**, not executable
-code paths in this repo.
+The files under `nemotron-customize/references/context/*.txt` are short
+curated context packs for the Nemotron-stack libraries (Megatron-Bridge,
+AutoModel, Curator, NeMo-RL, Evaluator, ModelOpt, Data Designer). They are
+read-only reference material for grounding agent changes in the real library
+APIs, not runtime code paths.
diff --git a/skills/nemotron-customize/.claude-plugin/plugin.json b/skills/nemotron-customize/.claude-plugin/plugin.json
new file mode 100644
index 000000000..a7e4aa56b
--- /dev/null
+++ b/skills/nemotron-customize/.claude-plugin/plugin.json
@@ -0,0 +1,10 @@
+{
+  "name": "nemotron-customize",
+  "description": "Compose runnable NVIDIA Nemotron model-customization pipelines from existing repo steps.",
+  "version": "0.1.0",
+  "author": {
+    "name": "NVIDIA Nemotron Team"
+  },
+  "homepage": "https://github.com/NVIDIA/Nemotron",
+  "skills": ["./"]
+}
diff --git a/skills/nemotron-customize/SKILL.md b/skills/nemotron-customize/SKILL.md
index 94090e0e3..d0e644e35 100644
--- a/skills/nemotron-customize/SKILL.md
+++ b/skills/nemotron-customize/SKILL.md
@@ -1,33 +1,65 @@
 ---
 name: nemotron-customize
-description: Compose runnable training pipelines from steps under src/nemotron/steps/. Plans a stage DAG, validates artifact wiring against types.toml, fires patterns, then generates a forkable Python project. Use when the user wants to fine-tune, pretrain, align, evaluate, or optimize a Nemotron-stack model end-to-end.
+description: Use when building runnable Nemotron model-customization pipelines from existing repo steps and artifact contracts.
+version: 0.1.0
+metadata:
+  author: NVIDIA Nemotron Team
+  tags:
+    - nemotron
+    - customization
+    - training
+    - pipelines
 ---
 
 # nemotron-customize
 
-Invocation: `/nemotron-customize`.
+## Purpose
 
-You compose **steps** from [src/nemotron/steps/](../../src/nemotron/steps/)
-into a runnable Python project the user owns. **The step library is the
-source of truth.** This skill orchestrates — it does not duplicate per-step
-knowledge.
+Use this skill to turn a model-customization request into a repo-native Nemotron step pipeline. It plans the step DAG, validates artifact wiring, and creates only the YAML configs needed to run existing steps.
 
-When you need to know what a step does, read its `step.toml` and `SKILL.md`.
-When you need to know whether a chain is sound, read the patterns it cites.
-When you need to write code for a stage, read `step.py` + the runner +
-(if mapped in [context/index.toml](context/index.toml)) the context pack.
+Use it only for inspecting, configuring, validating, running, or submitting
+existing Nemotron steps or multi-step training/customization pipelines. If the
+request is a frontend, dashboard, visualization, generic ML-advice,
+billing/access, or unrelated coding task, stop with a short scope note and do
+not inspect the step catalog or edit files in that turn.
 
-## Tone
+## Requirements
 
-Concise. Technical. No fluff.
+- A checkout of this Nemotron repo with `src/nemotron/steps/` available.
+- **Invoke from the repo root.** All file paths in this document are repo-root-relative (e.g. `src/nemotron/steps/STEPS.md`, `skills/nemotron-customize/references/act/STAGE.md`). Resolve them against the user's current working directory, which must be the Nemotron checkout root.
+- User-provided model, data, hardware, backend, and output constraints before writing configs.
+- Backend credentials only when the selected step requires them, such as translation or W&B-enabled training.
 
-- Status updates: ≤2 lines.
-- Plan commentary: one sentence per stage, max.
-- Decision explanations: tables over paragraphs.
-- Never start with "Great", "Sure", "Certainly", "Of course".
-- No emojis unless the user uses them first.
+## Limitations
 
----
+- This skill does not invent new catalog steps when an existing step can satisfy the request.
+- New Python or shell code is allowed only in Explorer mode after the repo capability gap is explicit.
+- Post-training deployment-only requests are out of scope unless they are part of a model-customization pipeline.
+
+Invocation: `/nemotron-customize`.
+
+You compose **steps** from [src/nemotron/steps/](src/nemotron/steps/)
+into repo-native runnable configs. **The current codebase is the source of
+truth.** This skill orchestrates — it does not duplicate per-step knowledge.
+
+Priority order:
+
+1. Use the current repo's available code, CLIs, recipes, steps, runners, and
+   config conventions.
+2. Create only new YAML config files needed to serve the user's request.
+3. Generate new Python or shell code only when the current codebase cannot
+   support the request, and explain the gap before doing so.
+
+When you need to know what a step does, read its `step.toml` and `SKILL.md`.
+When you need to know whether a chain is sound, read the patterns it cites.
+When you need to configure a stage, read `step.py` + the runner + existing
+configs to learn the supported YAML shape. Read context packs only if new code
+is unavoidable.
+
+For a command request, the fast path is: verify repo root, run or read the step
+catalog, read the selected `step.toml`, verify the requested config exists,
+read the active env TOML for any remote profile, then emit the complete command.
+Do not guess `--batch` profiles from examples or naming conventions.
 
 ## How information is split (and where to find it)
 
@@ -36,19 +68,73 @@ Concise. Technical. No fluff.
 | What does step X consume / produce / parameterize? | `src/nemotron/steps/<cat>/<X>/step.toml` |
 | When/why pick step X over its siblings? | `src/nemotron/steps/<cat>/<X>/SKILL.md` |
 | Which step in category C should I pick? | `src/nemotron/steps/<cat>/SKILL.md` |
-| What runner code does step X use? | `src/nemotron/steps/<cat>/<X>/step.py` → [_runners/](../../src/nemotron/steps/_runners/) |
-| Cross-step constraint (tokenizer lock, eval bookends, ...) | `src/nemotron/steps/patterns/<id>.md` |
-| Artifact compatibility / `is_a` / `convert_to` | [src/nemotron/steps/types.toml](../../src/nemotron/steps/types.toml) |
-| GPU memory / parallelism heuristics | [src/nemotron/steps/hardware.md](../../src/nemotron/steps/hardware.md) |
-| Library API extracts for code generation | [context/index.toml](context/index.toml) → `context/<pack>.txt` |
-| Project scaffold rules (CLI, pyproject, README, deploy) | [act/PROJECT.md](act/PROJECT.md) |
-| Per-stage code rules (R1–R5, dry-run, W&B) | [act/STAGE.md](act/STAGE.md) |
+| What runner code does step X use? | `src/nemotron/steps/<cat>/<X>/step.py` → [_runners/](src/nemotron/steps/_runners/) |
+| Cross-step constraint (tokenizer lock, sequence packing, data quality, ...) | `src/nemotron/steps/patterns/<id>.md` |
+| Artifact compatibility / `is_a` hierarchy | [src/nemotron/steps/types.toml](src/nemotron/steps/types.toml) |
+| GPU memory / parallelism heuristics | [src/nemotron/steps/hardware.md](src/nemotron/steps/hardware.md) |
+| Library API extracts for exceptional code generation | [references/context/index.toml](skills/nemotron-customize/references/context/index.toml) → `references/context/<pack>.txt` |
+| Project scaffold rules, only when repo code cannot support the request | [references/act/PROJECT.md](skills/nemotron-customize/references/act/PROJECT.md) |
+| Per-stage code rules, only when repo code cannot support the request | [references/act/STAGE.md](skills/nemotron-customize/references/act/STAGE.md) |
 
 If two sources say the same thing, the **deeper, more specific** one wins
 (`step.toml` > category `SKILL.md` > this file).
 
 ---
 
+## Instructions
+
+Use this skill when the user asks for an end-to-end Nemotron-stack pipeline:
+fine-tuning, continued pretraining, alignment training, data curation,
+translation for training data, or other data preprocessing for model training.
+Follow the workflow below in order:
+
+1. **Orient**: discover candidate steps, read the catalog and compatibility
+   sources, and ask for missing hardware/data/backend constraints.
+2. **Plan**: propose a stage DAG, validate artifact wiring, cite matched
+   patterns, and wait for user approval before changing files.
+3. **Act**: create the minimal YAML configs for the selected repo steps.
+   Generate code only if no current repo path can satisfy the request.
+4. **Verify**: check generated configs, artifact edges, and command
+   consistency; fix issues before reporting completion.
+
+Do not treat this skill as general ML advice. The step library under
+[src/nemotron/steps/](src/nemotron/steps/) is the source of truth.
+
+For single-step command questions, use this shorter flow instead of the full
+pipeline workflow:
+
+1. Confirm the repo root has `pyproject.toml` and `src/nemotron/steps/`.
+2. Run `uv run nemotron steps list --json` when available; otherwise read
+   [STEPS.md](src/nemotron/steps/STEPS.md).
+3. Read the selected step's `step.toml` and the requested checked-in config.
+4. For remote execution, read `NEMOTRON_ENV_FILE` or a repo-root `env*.toml`
+   and choose an actual section name whose profile matches the step.
+5. Return the command first, followed by only the rationale needed to explain
+   config/profile choices.
+
+For translation-only command requests, also read
+[src/nemotron/steps/translate/SKILL.md](src/nemotron/steps/translate/SKILL.md)
+and return `Decision`, `Config`, `Run`, `Output`, and `Env`. Do not continue
+broad repository exploration once those fields are execution-ready.
+
+Source tiers for command answers:
+
+- **Verified**: CLI, manifest, config, env profile, and dry-run all succeeded.
+- **Repo-grounded**: manifest, config, and env profile were read, but dry-run
+  could not be executed.
+- **Blocked**: a required repo file or env TOML is missing; name it and stop
+  before emitting a guessed remote command.
+
+Canonical commands:
+
+```bash
+uv run nemotron steps run <step_id> -c <config-or-path> --dry-run
+uv run nemotron steps run <step_id> -c <config-or-path> --dry-run --batch <profile>
+uv run nemotron steps run <step_id> -c <config-or-path> --batch <profile>
+```
+
+---
+
 ## Workflow
 
 Four phases, in order: **Orient → Plan → Act → Verify.** Never skip Verify.
@@ -63,35 +149,27 @@ Goal: enumerate candidate steps and gather the user's constraints in one pass.
 machine-readable:
 
 ```bash
-nemotron step list --json                                  # all steps
-nemotron step list --json --category sft                   # by category
-nemotron step list --json --consumes training_jsonl        # by input type
-nemotron step list --json --produces checkpoint_megatron   # by output type
-nemotron step show <step_id>                               # full manifest
+nemotron steps list --json                                 # all steps
+nemotron steps list --json --category sft                  # by category
+nemotron steps list --json --consumes training_jsonl       # by input type
+nemotron steps list --json --produces checkpoint_megatron  # by output type
+nemotron steps show <step_id>                              # full manifest
 ```
 
-Implementation: [list_cmd.py](../../src/nemotron/cli/commands/step/list_cmd.py),
-[show_cmd.py](../../src/nemotron/cli/commands/step/show_cmd.py),
-[run_cmd.py](../../src/nemotron/cli/commands/step/run_cmd.py).
-
-Per-step JSON schema: `{id, name, category, description, tags, path,
-consumes:[{type,required,description}], produces:[...], parameters:[...]}`.
-
 **Step 1.2 — Read these in parallel** (small files, all cheap):
 
-- [src/nemotron/steps/STEPS.md](../../src/nemotron/steps/STEPS.md) — auto-generated catalog (always read first).
-- [src/nemotron/steps/PATTERNS.md](../../src/nemotron/steps/PATTERNS.md) — auto-generated pattern index.
-- [src/nemotron/steps/types.toml](../../src/nemotron/steps/types.toml) — artifact compatibility graph (`is_a`, `convert_to`).
-- [src/nemotron/steps/hardware.md](../../src/nemotron/steps/hardware.md) — GPU heuristics if hardware is in scope.
+- [src/nemotron/steps/STEPS.md](src/nemotron/steps/STEPS.md) — auto-generated catalog (always read first).
+- [src/nemotron/steps/PATTERNS.md](src/nemotron/steps/PATTERNS.md) — auto-generated pattern index.
+- [src/nemotron/steps/types.toml](src/nemotron/steps/types.toml) — artifact compatibility graph (`is_a` hierarchy).
+- [src/nemotron/steps/hardware.md](src/nemotron/steps/hardware.md) — GPU heuristics if hardware is in scope.
 
 **Step 1.3 — For each candidate category, descend one level**:
 
 - `src/nemotron/steps/<cat>/SKILL.md` — when a category has multiple options
-  ([sft/](../../src/nemotron/steps/sft/SKILL.md),
-  [pretrain/](../../src/nemotron/steps/pretrain/SKILL.md),
-  [peft/](../../src/nemotron/steps/peft/SKILL.md),
-  [rl/nemo_rl/](../../src/nemotron/steps/rl/nemo_rl/SKILL.md),
-  [optimize/modelopt/](../../src/nemotron/steps/optimize/modelopt/SKILL.md)).
+  ([sft/](src/nemotron/steps/sft/SKILL.md),
+  [pretrain/](src/nemotron/steps/pretrain/SKILL.md),
+  [peft/](src/nemotron/steps/peft/SKILL.md),
+  [rl/nemo_rl/](src/nemotron/steps/rl/nemo_rl/SKILL.md)).
 
 **Step 1.4 — For each candidate step, read its `step.toml`** end-to-end.
 You're after: `[[consumes]]`, `[[produces]]`, `[[parameters]]`,
@@ -109,9 +187,8 @@ Present as a numbered list, replies as numbers or Enter for `[defaults]`:
 3. Data size (rough): \_\_\_ examples
 4. GPUs: count + type + nodes (e.g. `8x H100, 1 node`)
 5. Backend preference: `[nemo-run]` / plain Python
-6. Deploy: `[local only]` / Airflow / Kubeflow
-7. W&B: `[off]` / on (project name?)
-8. Output: `[./<project-name>/]` / current dir
+6. W&B: `[off]` / on (project name?)
+7. Output: `[./<project-name>/]` / current dir
 
 **Never assume hardware, data availability, or framework. Ask.**
 
@@ -132,107 +209,68 @@ Goal: produce a markdown plan the user reviews before any code is written.
 - Strategies fired (the `when:` clauses from `step.toml` that match).
 - Patterns cited (from `src/nemotron/steps/patterns/`).
 
-**Step 2.3 — Run preflight validation.** Each item is a hard check:
-
-| # | Check | Source of truth |
-|---|---|---|
-| 1 | Every `consumes.type` matches an upstream `produces.type` (direct or via `is_a`). | [types.toml](../../src/nemotron/steps/types.toml) |
-| 2 | If a chain breaks, insert the right converter step. | `convert_to` in [types.toml](../../src/nemotron/steps/types.toml) → [convert/megatron_to_hf](../../src/nemotron/steps/convert/megatron_to_hf/), [convert/hf_to_megatron](../../src/nemotron/steps/convert/hf_to_megatron/), [convert/merge_lora](../../src/nemotron/steps/convert/merge_lora/) |
-| 3 | Tokenizer + chat template + seq_length consistent across prep ↔ train ↔ RL ↔ eval. | [patterns/prep-data-is-tokenizer-locked.md](../../src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md), [patterns/sft-sequence-packing.md](../../src/nemotron/steps/patterns/sft-sequence-packing.md) |
-| 4 | LoRA outputs are merged before eval/RL. | [patterns/peft-adapter-merge-discipline.md](../../src/nemotron/steps/patterns/peft-adapter-merge-discipline.md) |
-| 5 | Eval bookends present (before + after training). | [patterns/eval-before-and-after-training.md](../../src/nemotron/steps/patterns/eval-before-and-after-training.md) |
-| 6 | RL warm-starts from SFT; rewards validated before scale. | [patterns/rl-validate-rewards-before-scale.md](../../src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md) |
-| 7 | GPU count ≥ chosen model's `min_gpus` (from `[[models]]` block in each `step.toml`). | step.toml + [hardware.md](../../src/nemotron/steps/hardware.md) |
-| 8 | Sovereign / customization patterns checked: `cpt-data-blend-scoping`, `sft-data-blending`, `multilingual-tokenizer-check`, `data-quality-before-quantity`, `sdg-pipeline-versioning`, `byob-benchmark-design`, `pretrain-token-budget-before-scale`, `sft-small-dataset-prefer-lora`, `convert-checkpoint-safety`. | [patterns/](../../src/nemotron/steps/patterns/) |
-
-When a check fails: surface it as a `⚠` warning in the plan and propose a
+**Step 2.3 — Run preflight validation.** Hard checks: artifact types chain via [types.toml](src/nemotron/steps/types.toml); tokenizer/template/sequence length align across prep and train; RL warm-starts from SFT; GPU count satisfies the selected model; applicable patterns are cited. When a check fails: surface it as a `WARNING:` line in the plan and propose a
 fix. When the user can't satisfy it (e.g. hardware), propose alternatives in
 descending preference: smaller model → AutoModel instead of Megatron-Bridge →
 LoRA instead of full FT.
 
-**Step 2.4 — Plan format:**
-
-````markdown
-# Pipeline Plan: <project-name>
-
-## Intent
-<One sentence: what we're building and why.>
-
-## Stages
-```mermaid
-graph LR
-    A[01_curate] -->|filtered_jsonl| B[02_prep]
-    B -->|packed_parquet| C[03_sft]
-    C -->|checkpoint_megatron| D[04_eval]
-```
-
-### 1. <category>/<step_id>
-- Consumes: <type> from <stage NN | user>
-- Produces: <type>
-- Key params: <2–3 from step.toml>
-- Strategies fired: <when-clauses that match>
-- Patterns cited: <pattern_id, pattern_id>
-
-<repeat per stage>
-
-## Validation (preflight)
-✓ Artifact chain
-✓ Tokenizer / template / seq_length consistency
-✓ Eval bookends present
-✓ GPU count ≥ min_gpus
-✓ All applicable patterns acknowledged
-⚠ <warnings — missing data, hardware risk, pattern violation, etc.>
-
-## Infrastructure
-| Resource | Required by | Notes |
-|---|---|---|
-| <resource> | <stage> | <status / question> |
-````
+**Step 2.4 — Plan format.** Include `Intent`, `Stages`, `Validation`, and `Infrastructure`. Use a Mermaid graph for artifact flow, one short stage block per step, and explicit `PASS:` / `WARNING:` validation lines.
 
 **Step 2.5 — Present the plan and wait.** Don't proceed to Act until the
-user approves or requests changes.
+user approves or requests changes. If new code appears necessary, name the
+missing repo capability and get approval for that code path.
 
 ---
 
 ### Phase 3 — Act
 
-Goal: produce a complete, runnable Python project. No placeholders. No TODOs.
+Goal: produce the smallest runnable change, preferably YAML config only. No
+placeholders. No TODOs.
 
-**Step 3.1 — Load codegen rules.**
+**Step 3.1 — Prefer the existing repo execution path.**
 
-- Main agent reads [act/PROJECT.md](act/PROJECT.md) (project scaffold rules).
-- Each per-stage sub-agent reads [act/STAGE.md](act/STAGE.md) (R1–R5 +
-  code-quality + dry-run + W&B).
+Before creating any code, identify how the existing repo can run each stage:
+
+- CLI commands under [src/nemotron/cli/](src/nemotron/cli/).
+- Step entrypoints in `src/nemotron/steps/<cat>/<step>/step.py`.
+- Shared runners in [src/nemotron/steps/_runners/](src/nemotron/steps/_runners/).
+- Existing configs under the selected step, recipe, or runner directory.
 
-**Step 3.2 — Main agent generates the scaffold:**
+**Step 3.2 — Generate only YAML configs when the repo supports the request.**
 
 ```
 <project-name>/
-├── pyproject.toml
-├── .python-version              # "3.12"
-├── README.md                    # with mermaid + stage table
-├── env.toml.example
-├── <project_name>/
-│   ├── __init__.py
-│   ├── __main__.py              # `from .cli import app; app()`
-│   ├── cli.py                   # Typer; one cmd per stage + `all`
-│   └── stages/                  # populated by sub-agents
-└── .generated/
-    ├── pipeline.toml            # canonical stage graph
-    ├── SKILL.md                 # invocable as /<project-name> (with frontmatter)
-    └── plugin.json              # .claude-plugin manifest
+├── configs/
+│   └── <stage-name>.yaml        # user-specific config for an existing step
+└── README.md                    # optional: only if the user asks for run docs
 ```
 
-Naming: `<project-name>` is kebab-case (skill invocation, DAG name);
-`<project_name>` is snake_case (Python identifier).
+Naming: `<project-name>` is kebab-case. YAML filenames should match approved
+stage names.
+
+Each YAML config must:
+
+- Match keys read by the existing `step.py` and runner code.
+- Adapt existing default/tiny configs instead of inventing a schema.
+- Use user-provided paths, model IDs, hardware, backend, and W&B settings.
+- Preserve artifact compatibility from the approved plan.
+
+**Step 3.3 — Only use codegen when YAML cannot satisfy the request.**
 
-**Step 3.3 — For each stage, spawn one sub-agent in parallel:**
+If the repo lacks a callable step, runner, CLI, or config surface for the
+requested behavior, load codegen rules:
+
+- Main agent reads [references/act/PROJECT.md](skills/nemotron-customize/references/act/PROJECT.md) (project scaffold rules).
+- Each per-stage sub-agent reads [references/act/STAGE.md](skills/nemotron-customize/references/act/STAGE.md) (R1–R5 +
+  code-quality + dry-run + W&B).
+
+Then implement the missing stage with the narrowest possible code change:
 
 ```
 You are implementing stage <NN>_<name> = <step_id>.
 
 Load:
-  - skills/nemotron-customize/act/STAGE.md
+  - skills/nemotron-customize/references/act/STAGE.md
   - <context_pack_path>                       # from context/index.toml; OPTIONAL — skip if not mapped
   - src/nemotron/steps/<cat>/<step>/step.py   # primary code shape
   - src/nemotron/steps/_runners/<runner>.py   # if step.py imports a shared runner
@@ -248,86 +286,59 @@ Deliverables (exactly these):
   - run.py
   - __init__.py
   - config/default.yaml
-  - config/tiny.yaml
+  - config/tiny.yaml, or the step's checked-in smoke config name such as config/tiny_chat.yaml for eval/model_eval
 
 Report back: files written, knobs exposed, UPSTREAM notes, strategies followed.
 ```
 
-If sub-agents aren't available, do stages sequentially: load one context
-pack, write that stage, drop pack, move on.
+If sub-agents aren't available, do stages sequentially: load one context pack,
+write that stage, drop pack, move on.
 
-**Step 3.4 — Step.py + the runner are the reference.** Don't invent library
-APIs from memory. Mirror what the in-repo code does:
+**Step 3.4 — Step.py + the runner are the reference.** Don't invent YAML keys
+or library APIs from memory. Mirror what the in-repo code does:
 
-- [steps/_runners/megatron_bridge.py](../../src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps.
-- [steps/_runners/automodel.py](../../src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps.
-- [steps/_runners/nemo_rl.py](../../src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps.
-- [steps/_runners/modelopt.py](../../src/nemotron/steps/_runners/modelopt.py) — used by quantize/prune/distill.
+- [steps/_runners/megatron_bridge.py](src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps.
+- [steps/_runners/automodel.py](src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps.
+- [steps/_runners/nemo_rl.py](src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps.
 
-For steps without a context pack (`sft/megatron_bridge`, `eval/model_eval`,
-`curate/nemo_curator`, `translate/nemo_skills`, `convert/*`), the agent
-combines: per-step `SKILL.md` + `step.toml [[strategies]]` + `step.py` + the
-URLs in `[reference]`. That's enough.
+When a step has no context pack, the agent combines: per-step `SKILL.md` + `step.toml [[strategies]]` + `step.py` + the URLs in `[reference]`. That is enough.
 
 ---
 
 ### Phase 4 — Verify
 
-Goal: every preflight check holds against the *generated files*, not just
-the plan.
+Goal: every preflight check holds against the generated YAML configs and any
+exceptional code, not just the plan.
 
 Run through:
 
-- [ ] Every stage script has valid Python syntax (no placeholder functions).
-- [ ] Every import references a real module from the step's reference code.
-- [ ] Every `config/*.yaml` is valid; keys match what `run.py` reads.
-- [ ] `.generated/pipeline.toml` matches the generated `stages/` dirs.
+- [ ] Every generated `*.yaml` is valid; keys match the existing step/runner code.
 - [ ] Artifact wiring is consistent (stage N output type = stage N+1 input type).
-- [ ] `pyproject.toml` covers every imported third-party package.
-- [ ] `README.md` mermaid matches the actual stages.
-- [ ] `tiny.yaml` configs use reduced iters, batch sizes, max_steps.
-- [ ] Tokenizer + seq_length aligned across prep ↔ train ↔ eval YAMLs.
-- [ ] No `${art:...}` references leaked into generated configs (those belong only in [src/nemotron/recipes/](../../src/nemotron/recipes/)).
+- [ ] Existing CLI or runner commands can consume the generated configs.
+- [ ] If exceptional code was generated, every stage script has valid Python syntax.
+- [ ] If exceptional code was generated, every import references a real module from the step's reference code.
+- [ ] If a README was generated, its commands match the actual configs.
+- [ ] Smoke-test YAML configs use reduced iters, batch sizes, max_steps.
+- [ ] Tokenizer + seq_length aligned across prep ↔ train YAMLs.
+- [ ] No `${art:...}` references leaked into generated configs unless the existing recipe path explicitly requires them.
 
 If verification finds issues, fix them silently. Don't say "I noticed an issue."
 
 ---
 
-## Operational nuances (not in patterns/)
-
-These are generation-time concerns, not ML decision rules. Patterns own ML
-rules; this section owns what *this skill specifically* does.
-
-### `tiny.yaml` is for plumbing, not metrics
+## Operational Nuances
 
-Each step ships `config/default.yaml` (production) and `config/tiny.yaml`
-(smoke test: handful of iters, micro batch, tiny seqlen). Generated projects
-must mirror this and **default the CLI to `default`**. tiny is for verifying
-the wiring runs end-to-end on a cheap budget — never for evidence of model
-quality.
+- Smoke configs such as `tiny.yaml` or eval/model_eval's `tiny_chat.yaml` are for wiring tests, not model-quality evidence.
+- If a `step.toml` strategy points to unavailable upstream docs, use its `then:` text and mark the plan for manual review.
+- Preserve `${art:...}` only in recipe-backed configs; standalone YAML should use plain paths.
+- Keep pretraining `bin/idx` data and `blend.json` from the same Nemotron release.
 
-### Strategy `skill:` pointers may not resolve
+## Examples
 
-Many `[[strategies]]` blocks in `step.toml` carry a `skill:` pointer
-(`Megatron-Bridge/skills/perf-techniques/...`, `Automodel/docs/guides/...`).
-Those paths live in upstream repos, not here. If you can't read them, **don't
-fail** — use the `then:` text as guidance and put a `⚠` in the plan: "Could
-not read perf-tuning docs for `<topic>` — config may need manual review."
-
-### `${art:...}` belongs only to recipes/, not generated projects
-
-The reference recipes under [src/nemotron/recipes/](../../src/nemotron/recipes/)
-use `${art:data,path}`, `${art:model,iteration}` for W&B-Artifacts lineage.
-**Don't propagate `${art:...}` into generated stage configs** — they get
-plain DATA_ROOT layout instead (see [act/PROJECT.md](act/PROJECT.md) R2).
-
-### `bin/idx + blend.json` is version-coupled
-
-Pretraining data prep produces `binidx` plus a `blend.json` manifest. The
-`pretrain/megatron_bridge` step reads it via `dataset.data_paths`. **The two
-must come from the same Nemotron release** — don't mix a freshly-prepped
-blend with a six-month-old recipe. When the user can't reprep, surface a
-`⚠`.
+- Single step: read the manifest/config/env profile, then return a complete
+  `uv run nemotron steps run <step_id> -c <config> --dry-run` command.
+- Pipeline: plan the step DAG first, validate artifact edges, then create only
+  the project YAML overlays needed for the approved stages.
 
 ---
 
@@ -337,94 +348,53 @@ blend with a six-month-old recipe. When the user can't reprep, surface a
 
 Fast path. Levels 0 → 2 in Orient, then Plan → Act.
 
-`STEPS.md → category/SKILL.md → step.toml → step.py → write code`
+`STEPS.md → category/SKILL.md → step.toml → step.py → adapt YAML config`
 
 Use whenever the user's request maps to a step in the catalog.
 
-### Explorer mode — no step, but a library supports it
+### Explorer mode — no repo path supports it
 
-1. Look at libraries cited in nearby `step.toml [reference]` URLs.
-2. Read the relevant library docs / examples.
-3. Use [types.toml](../../src/nemotron/steps/types.toml) to type the new
+1. Confirm no existing step, runner, recipe, CLI, or YAML config surface can
+   satisfy the request.
+2. Look at libraries cited in nearby `step.toml [reference]` URLs.
+3. Read the relevant library docs / examples.
+4. Use [types.toml](src/nemotron/steps/types.toml) to type the new
    stage's consumes/produces.
-4. Write the stage from scratch, mirroring an existing `step.py` as a template.
+5. Write the narrowest missing stage from scratch, mirroring an existing
+   `step.py` as a template.
 
 Tell the user: "This use case doesn't have a pre-built step. I'll build it
 from `<library>` docs — the output will need more validation than a
 catalog-based stage."
 
 If the same Explorer build keeps appearing across projects, suggest the user
-run `/nemotron-add-step` to land it in the catalog.
+contribute it as a new catalog step under `src/nemotron/steps/`.
 
 ### Choosing a mode
 
 | User says | Mode |
 |---|---|
 | "SFT with Megatron-Bridge / AutoModel" | Catalog |
-| "Distill / quantize / prune a model" | Catalog ([optimize/modelopt/*](../../src/nemotron/steps/optimize/modelopt/)) |
-| "DPO / RLVR / GRPO / RLHF" | Catalog ([rl/nemo_rl/*](../../src/nemotron/steps/rl/nemo_rl/)) |
-| "Synthesize preference / SFT data" | Catalog ([sdg/data_designer](../../src/nemotron/steps/sdg/data_designer/)) |
-| "Translate EN → \<lang\>" | Catalog ([translate/nemo_skills](../../src/nemotron/steps/translate/nemo_skills/)) |
-| "Curate web text" | Catalog ([curate/nemo_curator](../../src/nemotron/steps/curate/nemo_curator/)) |
-| "Deploy to TensorRT-LLM" | Explorer (no step yet — derive from upstream library docs and add a `convert/*` step if the path stabilizes) |
+| "DPO / RLVR / GRPO / RLHF" | Catalog ([rl/nemo_rl/*](src/nemotron/steps/rl/nemo_rl/)) |
+| "Synthesize preference / SFT data" | Catalog ([sdg/data_designer](src/nemotron/steps/sdg/data_designer/)) |
+| "Translate EN → \<lang\> for training data" | Catalog ([translate/nemo_curator](src/nemotron/steps/translate/nemo_curator/)) |
+| "Curate web text" | Catalog ([curate/nemo_curator](src/nemotron/steps/curate/nemo_curator/)) |
 | "Train with X exotic backend" | Explorer or **ask** |
+| Post-training deployment-only request | Out of scope for this skill; ask the user to use a more appropriate workflow. |
 | Ambiguous | **Ask** |
 
 ---
 
-## Domain vocabulary
-
-### Step vs stage
-
-- **Step** = abstract building block in [src/nemotron/steps/](../../src/nemotron/steps/) (e.g. "SFT with Megatron-Bridge"). No position, no customer config.
-- **Stage** = a step instantiated in a generated project (e.g. "stage 03: SFT for Thai Nano3"). Has a number, wired inputs, customer-specific YAML.
-
-Use "step" for the catalog, "stage" for the generated project.
-
-### Artifact graph
-
-```
-raw_jsonl ─is_a─> training_jsonl ─prep─> packed_parquet ─sft─> checkpoint_megatron
-                                                                      │
-                                                                  convert_to
-                                                                      ▼
-                                                                checkpoint_hf ─eval─> eval_results
-```
-
-Definitions in [types.toml](../../src/nemotron/steps/types.toml).
-
-### Config hierarchy
-
-```
-config/default.yaml  →  recipe defaults  →  CLI overrides
-```
-
-Plain OmegaConf YAML + `parse_hydra_overrides`. **Never** generate Hydra
-configs.
-
----
-
-## Tool preferences
-
-- **Catalog discovery**: `nemotron step list --json --consumes <type>` — don't grep `**/step.toml`.
-- **Manifest read**: `nemotron step show <id>` — fastest single read.
-- **Context packs**: load one large pack per stage via Act sub-agent — beats many small reads.
-- **Step.py read**: full file — they're <100 lines.
-- **Type validation**: read [types.toml](../../src/nemotron/steps/types.toml) once during Orient; keep in context through Verify.
-- **Parallel reads**: batch step.toml + category SKILL.md reads.
-
----
-
 ## Boundaries
 
 ### Do
 
 - Build pipelines from steps that exist; cite step.toml fields directly.
+- Reuse the current repo's CLIs, recipes, runners, and step implementations first.
 - Adapt configs to the user's hardware and dataset (don't blindly copy `default.yaml`).
 - Fire strategies and follow `skill:` pointers when perf-tuning.
-- Insert converter steps when artifact types don't chain.
-- Ask about hardware, data, deploy target — never assume.
-- Generate both `default.yaml` and `tiny.yaml` for every stage.
+- Ask about hardware, data, backend, and output path — never assume.
+- Generate only the YAML configs needed for the approved request.
 - Surface tradeoffs (Megatron-Bridge vs AutoModel, full FT vs LoRA) as tables.
 - Present the plan and wait for approval.
 
@@ -432,12 +402,14 @@ configs.
 
 - Invent steps. Use Explorer mode or ask.
 - Skip Plan for any pipeline ≥2 stages.
+- Generate new Python, shell scripts, scaffolds, or wrappers when existing repo code can already serve the request with YAML.
 - Import from modules not present in the step's reference code.
 - Add monitoring / logging / W&B unless the user asks.
 - Tune parallelism beyond what `hardware.md` and `[[strategies]]` advise.
 - Assume GPU count, type, or interconnect.
-- Generate Slurm/Airflow/Kubeflow wrappers unless requested.
-- Modify [src/nemotron/steps/](../../src/nemotron/steps/). To extend the catalog, route the user to `/nemotron-add-step`.
+- Generate Slurm/Airflow/Kubeflow wrappers.
+- Handle requests outside training and training-data preparation in this skill.
+- Modify [src/nemotron/steps/](src/nemotron/steps/). To extend the catalog, point the user to the contribution workflow in `CONTRIBUTING.md`.
 - Restate per-step rules in this skill — link to the step's `SKILL.md` instead.
 
 ---
@@ -446,19 +418,17 @@ configs.
 
 | Situation | Action |
 |---|---|
-| No step matches the user's request | Check libraries cited in nearby `step.toml [reference]`. If supported, use Explorer mode. Otherwise ask. |
-| Artifact types won't chain | Look up `convert_to` in [types.toml](../../src/nemotron/steps/types.toml). If a converter exists, add it. Otherwise: explain the gap and ask. |
-| Strategy points to a missing skill file | Skip the load. Use the `then:` text as guidance. Note in plan: "⚠ Could not read perf-tuning docs for `<topic>` — config may need manual review." |
+| No existing repo path matches the user's request | Check libraries cited in nearby `step.toml [reference]`. If supported, use Explorer mode. Otherwise ask. |
+| Artifact types won't chain | Explain the gap and ask the user whether to change the training/data-prep plan. Do not add post-training work here. |
+| Strategy points to a missing skill file | Skip the load. Use the `then:` text as guidance. Note in plan: "WARNING: Could not read perf-tuning docs for `<topic>` — config may need manual review." |
 | User's hardware is too small | Show the relevant `[[models]]` `min_gpus` table. Suggest in order: smaller model → AutoModel → LoRA. |
 | Two failed Act attempts | Stop. Explain what was tried, what failed, ask the user how to proceed. |
-| User wants a feature that crosses 3+ projects | Build it Explorer-mode for them now. Then suggest `/nemotron-add-step` to land it in the catalog. |
-
----
+| User wants a feature that crosses 3+ projects | Confirm YAML and existing repo code cannot serve it. If they cannot, build it Explorer-mode for them now, then suggest contributing it as a new step under `src/nemotron/steps/`. |
 
-## Related skills
+## Troubleshooting
 
-- **[/nemotron-nano3](../nemotron-nano3/SKILL.md)** — facts about Nano3 (architecture, data, recipes, eval). Hands off here for "build me a pipeline."
-- **[/nemotron-super3](../nemotron-super3/SKILL.md)** — facts about Super3.
-- **[/nemotron-add-step](../nemotron-add-step/SKILL.md)** — extend the step catalog when Explorer mode keeps recurring.
-- **[/nemotron-add-pattern](../nemotron-add-pattern/SKILL.md)** — encode a new cross-cutting decision rule.
-- **[/nemotron-add-model](../nemotron-add-model/SKILL.md)** — onboard a new model family.
+| Symptom | Action |
+|---|---|
+| Artifact types do not chain | Recheck `types.toml` and change the DAG before writing configs |
+| Remote profile is unclear | Read the active env TOML; do not guess `--batch` |
+| Config key is unclear | Read the step config, `step.py`, and shared runner before editing |
diff --git a/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt b/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt
deleted file mode 100644
index 85bc99e46..000000000
--- a/skills/nemotron-customize/context/automodel-launcher-executor-modes.txt
+++ /dev/null
@@ -1,2163 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Automodel
-├── docs
-│   ├── launcher
-│   │   ├── local-workstation.md *
-│   │   ├── nemo-run.md *
-│   │   ├── overview.md *
-│   │   ├── skypilot.md *
-│   │   └── slurm.md *
-│   ├── about
-│   ├── guides
-│   │   ├── diffusion
-│   │   ├── llm
-│   │   ├── omni
-│   │   └── vlm
-│   └── model-coverage
-│       ├── diffusion
-│       │   ├── black-forest-labs
-│       │   ├── hunyuanvideo-community
-│       │   └── wan-ai
-│       ├── llm
-│       │   ├── allenai
-│       │   ├── baai
-│       │   ├── baichuan-inc
-│       │   ├── bigcode
-│       │   ├── bytedance-seed
-│       │   ├── cohere
-│       │   ├── deepseek-ai
-│       │   ├── eleutherai
-│       │   ├── google
-│       │   ├── ibm
-│       │   ├── inceptionai
-│       │   ├── internlm
-│       │   ├── lgai-exaone
-│       │   ├── meta
-│       │   ├── microsoft
-│       │   ├── minimax
-│       │   ├── mistralai
-│       │   ├── moonshotai
-│       │   ├── nvidia
-│       │   ├── openai
-│       │   ├── openbmb
-│       │   ├── orionstar
-│       │   ├── parasail-ai
-│       │   ├── qwen
-│       │   ├── stabilityai
-│       │   ├── stepfun-ai
-│       │   ├── thudm
-│       │   ├── tiiuae
-│       │   └── upstage
-│       ├── omni
-│       │   ├── microsoft
-│       │   └── qwen
-│       └── vlm
-│           ├── google
-│           ├── huggingface
-│           ├── internlm
-│           ├── llava-hf
-│           ├── meta
-│           ├── mistralai
-│           ├── moonshotai
-│           ├── nvidia
-│           └── qwen
-├── examples
-│   ├── llm_finetune
-│   │   ├── llama3_2
-│   │   │   ├── llama3_2_1b_squad.yaml *
-│   │   │   └── llama3_2_1b_squad_skypilot.yaml *
-│   │   ├── baichuan
-│   │   ├── cohere
-│   │   ├── deepseek_v32
-│   │   ├── devstral
-│   │   ├── falcon
-│   │   ├── gemma
-│   │   ├── glm
-│   │   ├── gpt_oss
-│   │   ├── granite
-│   │   ├── llama3_1
-│   │   ├── llama3_3
-│   │   ├── minimax_m2
-│   │   ├── mistral
-│   │   ├── moonlight
-│   │   ├── nemotron
-│   │   ├── nemotron_flash
-│   │   ├── olmo
-│   │   ├── phi
-│   │   ├── qwen
-│   │   ├── seed
-│   │   ├── starcoder
-│   │   └── stepfun
-│   ├── convergence
-│   │   └── tulu3
-│   │       ├── data
-│   │       ├── eval
-│   │       ├── inference
-│   │       ├── model-verification
-│   │       ├── models
-│   │       │   ├── gpt-oss-20b
-│   │       │   │   └── assets
-│   │       │   ├── moonlight-16b
-│   │       │   │   └── assets
-│   │       │   ├── qwen3-4b
-│   │       │   │   └── assets
-│   │       │   └── qwen3-moe-30b
-│   │       │       ├── assets
-│   │       │       └── experiments
-│   │       └── training
-│   ├── diffusion
-│   │   ├── finetune
-│   │   ├── generate
-│   │   │   └── configs
-│   │   └── pretrain
-│   ├── dllm_generate
-│   ├── dllm_sft
-│   ├── llm_benchmark
-│   │   ├── deepseek
-│   │   ├── glm
-│   │   ├── gpt_oss
-│   │   ├── kimi
-│   │   ├── llama3_3
-│   │   ├── minimax
-│   │   ├── mistral
-│   │   ├── moonlight
-│   │   ├── nemotron
-│   │   ├── qwen
-│   │   └── step
-│   ├── llm_kd
-│   │   └── llama3_2
-│   ├── llm_pretrain
-│   ├── llm_seq_cls
-│   │   └── glue
-│   ├── retrieval
-│   │   ├── bi_encoder
-│   │   │   └── llama_embed_nemotron_8b
-│   │   ├── cross_encoder
-│   │   └── data_utils
-│   ├── vlm_benchmark
-│   │   ├── kimi
-│   │   ├── mistral
-│   │   └── qwen
-│   ├── vlm_finetune
-│   │   ├── gemma3
-│   │   ├── gemma3n
-│   │   ├── gemma4
-│   │   ├── internvl
-│   │   ├── kimi
-│   │   ├── mistral
-│   │   ├── mistral4
-│   │   ├── nemotron
-│   │   ├── phi4
-│   │   ├── qwen2_5
-│   │   ├── qwen3
-│   │   ├── qwen3_5
-│   │   └── qwen3_5_moe
-│   └── vlm_generate
-├── nemo_automodel
-│   ├── components
-│   │   ├── launcher
-│   │   │   ├── nemo_run
-│   │   │   │   ├── config.py * +
-│   │   │   │   └── launcher.py * +
-│   │   │   ├── skypilot
-│   │   │   │   ├── config.py * +
-│   │   │   │   └── launcher.py * +
-│   │   │   ├── base.py * +
-│   │   │   └── interactive.py * +
-│   │   ├── _peft
-│   │   ├── attention
-│   │   ├── checkpoint
-│   │   │   └── _backports
-│   │   ├── config
-│   │   ├── datasets
-│   │   │   ├── diffusion
-│   │   │   ├── dllm
-│   │   │   ├── llm
-│   │   │   │   └── megatron
-│   │   │   └── vlm
-│   │   ├── distributed
-│   │   │   └── pipelining
-│   │   ├── flow_matching
-│   │   │   └── adapters
-│   │   ├── loggers
-│   │   ├── loss
-│   │   │   └── triton
-│   │   ├── models
-│   │   │   ├── baichuan
-│   │   │   ├── common
-│   │   │   ├── deepseek_v3
-│   │   │   ├── deepseek_v32
-│   │   │   ├── gemma4_moe
-│   │   │   ├── glm4_moe
-│   │   │   ├── glm4_moe_lite
-│   │   │   ├── glm_moe_dsa
-│   │   │   ├── gpt_oss
-│   │   │   ├── kimi_k25_vl
-│   │   │   ├── kimivl
-│   │   │   ├── llama
-│   │   │   ├── llama_bidirectional
-│   │   │   ├── minimax_m2
-│   │   │   ├── mistral3
-│   │   │   ├── mistral4
-│   │   │   ├── nemotron_parse
-│   │   │   ├── nemotron_v3
-│   │   │   ├── qwen2
-│   │   │   ├── qwen3_5_moe
-│   │   │   ├── qwen3_moe
-│   │   │   ├── qwen3_next
-│   │   │   ├── qwen3_omni_moe
-│   │   │   ├── qwen3_vl_moe
-│   │   │   └── step3p5
-│   │   ├── moe
-│   │   │   ├── megatron
-│   │   │   └── uccl_ep
-│   │   ├── optim
-│   │   ├── quantization
-│   │   ├── training
-│   │   └── utils
-│   ├── _diffusers
-│   ├── _transformers
-│   │   └── tokenization
-│   ├── autonvtx
-│   ├── cli
-│   ├── recipes
-│   │   ├── diffusion
-│   │   ├── dllm
-│   │   ├── llm
-│   │   ├── retrieval
-│   │   └── vlm
-│   └── shared
-├── .github
-│   ├── actions
-│   │   ├── build-container
-│   │   └── test-template
-│   └── workflows
-│       └── config
-├── docker
-│   └── common
-├── scripts
-├── skills
-│   ├── .claude
-│   │   └── skills
-│   │       ├── developer-guide
-│   │       ├── distributed-training
-│   │       ├── launcher-config
-│   │       ├── model-onboarding
-│   │       ├── parity-testing
-│   │       └── recipe-development
-│   ├── developer-guide
-│   ├── distributed-training
-│   ├── launcher-config
-│   ├── model-onboarding
-│   ├── parity-testing
-│   └── recipe-development
-├── tests
-│   ├── ci_tests
-│   │   ├── configs
-│   │   │   ├── llm_benchmark
-│   │   │   ├── llm_finetune
-│   │   │   ├── vlm_benchmark
-│   │   │   └── vlm_finetune
-│   │   ├── golden_values
-│   │   │   ├── llm_finetune
-│   │   │   │   ├── baichuan
-│   │   │   │   ├── falcon
-│   │   │   │   ├── gemma
-│   │   │   │   ├── glm
-│   │   │   │   ├── gpt_oss
-│   │   │   │   ├── granite
-│   │   │   │   ├── llama3_1
-│   │   │   │   ├── llama3_2
-│   │   │   │   ├── mistral
-│   │   │   │   ├── moonlight
-│   │   │   │   ├── nemotron
-│   │   │   │   ├── nemotron_flash
-│   │   │   │   ├── olmo
-│   │   │   │   ├── phi
-│   │   │   │   ├── qwen
-│   │   │   │   ├── seed
-│   │   │   │   └── starcoder
-│   │   │   └── vlm_finetune
-│   │   │       ├── gemma3
-│   │   │       ├── gemma3n
-│   │   │       ├── internvl
-│   │   │       ├── mistral
-│   │   │       ├── nemotron
-│   │   │       ├── qwen2_5
-│   │   │       ├── qwen3
-│   │   │       └── qwen3_5_moe
-│   │   ├── scripts
-│   │   └── utils
-│   ├── functional_tests
-│   │   ├── checkpoint
-│   │   ├── checkpoint_robustness
-│   │   ├── context_parallel
-│   │   ├── data
-│   │   │   └── llm
-│   │   ├── datasets
-│   │   │   └── llm
-│   │   ├── hf_dcp
-│   │   ├── hf_peft
-│   │   ├── hf_transformer
-│   │   ├── hf_transformer_finetune
-│   │   ├── hf_transformer_llm
-│   │   ├── hf_transformer_vlm
-│   │   ├── llm_pretrain_and_kd
-│   │   │   ├── customizer_retrieval
-│   │   │   ├── llm_seq_cls
-│   │   │   └── loss
-│   │   ├── retrieval
-│   │   └── training
-│   ├── unit_tests
-│   │   ├── _cli
-│   │   ├── _diffusers
-│   │   ├── _peft
-│   │   ├── _transformers
-│   │   ├── attention
-│   │   ├── checkpoint
-│   │   ├── components
-│   │   │   └── training
-│   │   ├── config
-│   │   ├── datasets
-│   │   │   ├── diffusion
-│   │   │   ├── dllm
-│   │   │   ├── llm
-│   │   │   └── vlm
-│   │   ├── diffusion_processors
-│   │   ├── distributed
-│   │   │   └── pipelining
-│   │   ├── flow_matching
-│   │   │   └── adapters
-│   │   ├── launcher
-│   │   ├── loggers
-│   │   ├── loss
-│   │   ├── models
-│   │   │   ├── baichuan
-│   │   │   ├── bi_encoder
-│   │   │   ├── common
-│   │   │   ├── deepseek_v3
-│   │   │   ├── deepseek_v32
-│   │   │   ├── gemma4
-│   │   │   ├── glm4_moe
-│   │   │   ├── glm4_moe_lite
-│   │   │   ├── glm_moe_dsa
-│   │   │   ├── gpt_oss
-│   │   │   ├── kimi_k25_vl
-│   │   │   ├── kimivl
-│   │   │   ├── llama
-│   │   │   ├── minimax_m2
-│   │   │   ├── mistral3
-│   │   │   ├── mistral4
-│   │   │   ├── nemotron_parse
-│   │   │   ├── nemotron_v3
-│   │   │   ├── qwen2
-│   │   │   ├── qwen3_5
-│   │   │   ├── qwen3_5_moe
-│   │   │   ├── qwen3_moe
-│   │   │   ├── qwen3_next
-│   │   │   ├── qwen3_omni_moe
-│   │   │   ├── qwen3_vl_moe
-│   │   │   └── step3p5
-│   │   ├── moe
-│   │   ├── optim
-│   │   ├── quantization
-│   │   ├── recipes
-│   │   │   ├── dllm
-│   │   │   └── llm
-│   │   ├── shared
-│   │   ├── tools
-│   │   ├── training
-│   │   └── utils
-│   └── utils
-├── tools
-│   └── diffusion
-│       ├── data
-│       └── processors
-└── tutorials
-    └── nemotron-parse
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; selected files shown.
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/base.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC, abstractmethod
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-
-class Launcher(ABC):
-    """Base class for all job launchers (interactive, SLURM, SkyPilot, nemo-run)."""
-
-    @abstractmethod
-    def launch(
-        self,
-        config: Dict[str, Any],
-        config_path: Path,
-        recipe_target: str,
-        launcher_config: Any,
-        extra_args: Optional[List[str]] = None,
-    ) -> int:
-        """Launch a recipe job.
-
-        Args:
-            config: Parsed YAML config dict (without the launcher section).
-            config_path: Resolved path to the original YAML file.
-            recipe_target: Dotted import path of the recipe class.
-            launcher_config: Launcher-specific configuration (dict, int, or None).
-            extra_args: Additional CLI overrides forwarded to the recipe.
-
-        Returns:
-            Process exit code (0 = success).
-        """
-        ...
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/interactive.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import importlib
-import logging
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from nemo_automodel.components.launcher.base import Launcher
-
-logger = logging.getLogger(__name__)
-
-
-def _get_repo_root() -> Path:
-    """Return the repository root.  If CWD looks like an editable checkout,
-    prepend it to ``PYTHONPATH`` so the local source takes precedence."""
-    cwd = Path.cwd()
-    if (cwd / "nemo_automodel/components").exists() and (cwd / "examples/").exists():
-        new_pp = str(cwd)
-        if "PYTHONPATH" in os.environ:
-            new_pp += ":" + os.environ["PYTHONPATH"]
-        os.environ["PYTHONPATH"] = new_pp
-        logger.info("Running job using source from: %s", cwd)
-        return cwd
-    return Path(__file__).parents[3]
-
-
-def resolve_recipe_cls(target_str: str):
-    """Import and return the recipe class from a dotted path.
-
-    "  pip install nemo-automodel          # CPU/basic\n"
-    "  pip install nemo-automodel[all]     # with CUDA & all extras\n\n"
-    """
-    module_path, cls_name = target_str.rsplit(".", 1)
-    module = importlib.import_module(module_path)
-    return getattr(module, cls_name)
-
-
-def _recipe_module_path(recipe_target: str, repo_root: Path) -> Path:
-    """Convert a dotted recipe target into an absolute filesystem path."""
-    module_path = recipe_target.rsplit(".", 1)[0]
-    relative = module_path.replace(".", "/") + ".py"
-    return repo_root / relative
-
-
-_INSTALL_MSG = (
-    "Local/interactive execution requires PyTorch and the full nemo_automodel package.\n"
-    "It looks like you have the lightweight CLI-only install (automodel[cli]).\n\n"
-    "To run jobs locally, install the full package:\n"
-    "  pip install nemo_automodel          # CPU/basic\n"
-    "  pip install nemo_automodel[all]     # with CUDA & all extras\n\n"
-    "For SLURM clusters, use sbatch with the reference slurm.sub script.\n"
-    "For SkyPilot or NeMo-Run, add a skypilot: or nemo_run: section to your YAML.\n\n"
-    "See: https://github.com/NVIDIA/NeMo-Automodel#readme"
-)
-
-
-class InteractiveLauncher(Launcher):
-    """Launch a recipe locally on the current node using torchrun or in-process."""
-
-    @staticmethod
-    def _is_torchrun_worker() -> bool:
-        """Return True when this process was already spawned by torchrun.
-
-        torchrun (``torch.distributed.run``) sets both ``LOCAL_RANK`` and
-        ``TORCHELASTIC_RUN_ID`` in the environment of every worker it spawns.
-        We check for both to avoid false positives from environments (e.g.
-        SLURM) that may set ``LOCAL_RANK`` without an active torchrun session.
-
-        When the user launches the CLI via
-        ``torchrun --nproc-per-node N -m nemo_automodel.cli.app config.yaml``,
-        each worker must run the recipe in-process instead of re-launching torchrun.
-        """
-        return "LOCAL_RANK" in os.environ and "TORCHELASTIC_RUN_ID" in os.environ
-
-    def _run_recipe_in_process(self, recipe_target: str, config: Dict[str, Any]) -> int:
-        """Instantiate and run a recipe in the current process."""
-        recipe_cls = resolve_recipe_cls(recipe_target)
-        recipe = recipe_cls(config)
-        recipe.setup()
-        return recipe.run_train_validation_loop()
-
-    def launch(
-        self,
-        config: Dict[str, Any],
-        config_path: Path,
-        recipe_target: str,
-        launcher_config: Any = None,
-        extra_args: Optional[List[str]] = None,
-    ) -> int:
-        try:
-            from torch.distributed.run import determine_local_world_size, get_args_parser
-            from torch.distributed.run import run as thrun
-        except ImportError:
-            logger.error(_INSTALL_MSG)
-            return 1
-
-        # Already inside a torchrun worker (e.g. user ran
-        # ``torchrun --nproc-per-node N -m nemo_automodel.cli.app config.yaml``).
-        # Run the recipe directly; do NOT re-launch torchrun.
-        if self._is_torchrun_worker():
-            logger.info(
-                "Detected existing torchrun environment (LOCAL_RANK=%s); running recipe in-process.",
-                os.environ["LOCAL_RANK"],
-            )
-            return self._run_recipe_in_process(recipe_target, config)
-
-        nproc_per_node: Optional[int] = launcher_config
-        repo_root = _get_repo_root()
-        script_path = _recipe_module_path(recipe_target, repo_root)
-
-        num_devices = determine_local_world_size(nproc_per_node="gpu")
-        assert num_devices > 0, "Expected num-devices to be > 0"
-
-        if nproc_per_node == 1 or num_devices == 1:
-            logger.info("Launching job locally on a single device")
-            return self._run_recipe_in_process(recipe_target, config)
-        else:
-            effective_nproc = nproc_per_node if nproc_per_node is not None else num_devices
-            logger.info("Launching job locally on %d devices", effective_nproc)
-
-            torchrun_parser = get_args_parser()
-            torchrun_args, _ = torchrun_parser.parse_known_args()
-            torchrun_args.training_script = str(script_path)
-            torchrun_args.training_script_args = ["-c", str(config_path)]
-            if extra_args:
-                torchrun_args.training_script_args.extend(extra_args)
-            torchrun_args.nproc_per_node = effective_nproc
-            return thrun(torchrun_args)
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/nemo_run/config.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass, field
-
-# Default path to user-defined executor definitions.
-# Respects the NEMORUN_HOME env var used by nemo-run itself (defaults to ~/.nemo_run).
-_NEMORUN_HOME = os.environ.get("NEMORUN_HOME", os.path.join(os.path.expanduser("~"), ".nemo_run"))
-DEFAULT_EXECUTORS_FILE = os.path.join(_NEMORUN_HOME, "executors.py")
-
-# Keys that belong to NemoRunConfig itself (not executor overrides).
-_LAUNCHER_KEYS = frozenset(
-    {
-        "executor",
-        "job_name",
-        "detach",
-        "tail_logs",
-        "executors_file",
-        "job_dir",
-        "overrides",
-    }
-)
-
-
-@dataclass
-class NemoRunConfig:
-    """Configuration for the NeMo-Run launcher backend.
-
-    The ``executor`` field selects a named executor from
-    ``$NEMORUN_HOME/executors.py``, or ``"local"`` for local execution.
-
-    Any key not recognised as a launcher setting is collected into
-    ``overrides`` and applied directly to the executor via ``setattr``.
-    This means any executor attribute (``nodes``, ``partition``,
-    ``container_image``, ``time``, ``env_vars``, etc.) can be overridden
-    from YAML without changes to this config class.
-    """
-
-    # Executor selection: name from EXECUTOR_MAP or "local"
-    executor: str = "local"
-
-    # Job metadata
-    job_name: str = ""
-
-    # Experiment behaviour
-    detach: bool = True
-    tail_logs: bool = False
-
-    # Path to executor definitions file
-    executors_file: str = field(default_factory=lambda: DEFAULT_EXECUTORS_FILE)
-
-    # Local directory for job artifacts (config snapshot, logs)
-    job_dir: str = ""
-
-    # Arbitrary executor attribute overrides (e.g. nodes, partition,
-    # container_image, time, env_vars).  Populated automatically from
-    # unrecognised YAML keys by ``from_dict``.
-    overrides: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, d: dict) -> "NemoRunConfig":
-        """Build from a raw YAML dict, splitting launcher keys from executor overrides."""
-        launcher_kwargs = {}
-        overrides = {}
-        for k, v in d.items():
-            if k in _LAUNCHER_KEYS:
-                launcher_kwargs[k] = v
-            else:
-                overrides[k] = v
-        launcher_kwargs.setdefault("overrides", {}).update(overrides)
-        return cls(**launcher_kwargs)
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/nemo_run/launcher.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import time as _time
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import yaml
-
-from nemo_automodel.components.launcher.base import Launcher
-from nemo_automodel.components.launcher.nemo_run.config import NemoRunConfig
-from nemo_automodel.components.launcher.nemo_run.utils import (
-    apply_overrides,
-    load_executor_from_file,
-    submit_nemo_run_job,
-)
-
-logger = logging.getLogger(__name__)
-
-# Config filename and its path inside the container (/nemo_run/code/).
-_CONFIG_FILENAME = "automodel_config.yaml"
-_REMOTE_CONFIG_PATH = f"/nemo_run/code/{_CONFIG_FILENAME}"
-
-
-class NemoRunLauncher(Launcher):
-    """Launch a recipe via NeMo-Run's executor API.
-
-    Supports loading pre-configured executors from ``$NEMORUN_HOME/executors.py``
-    (or a custom path) and submitting jobs as ``nemo_run.Script`` objects.
-    Works with any NeMo-Run executor backend (Slurm, Kubernetes, Docker, local).
-
-    Uses NeMo-Run's native ``Torchrun`` launcher so that distributed training
-    arguments (rendezvous, node rank, nproc-per-node) are managed automatically.
-    The training config YAML is packaged via ``PatternPackager`` so it is
-    available at ``/nemo_run/code/automodel_config.yaml`` inside the container.
-    """
-
-    def _resolve_executor(self, nr_config: NemoRunConfig) -> Any:
-        """Load a named executor or build a local one."""
-        try:
-            import nemo_run as run
-        except ImportError:
-            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
-            sys.exit(1)
-
-        if nr_config.executor == "local":
-            executor = run.LocalExecutor()
-            apply_overrides(executor, nr_config.overrides)
-            return executor
-
-        # Named executor from executors file
-        executor = load_executor_from_file(nr_config.executor, nr_config.executors_file)
-        apply_overrides(executor, nr_config.overrides)
-        return executor
-
-    @staticmethod
-    def _configure_torchrun(executor: Any, devices: int) -> None:
-        """Enable the native NeMo-Run Torchrun launcher on *executor*.
-
-        Sets ``executor.launcher = "torchrun"`` and
-        ``torchrun_nproc_per_node`` so NeMo-Run generates the correct
-        ``torchrun --nproc-per-node=<N>`` invocation in the sbatch script.
-        """
-        executor.launcher = "torchrun"
-        if hasattr(executor, "torchrun_nproc_per_node"):
-            executor.torchrun_nproc_per_node = devices
-
-    @staticmethod
-    def _setup_packager(executor: Any, config_path: str) -> None:
-        """Configure a ``PatternPackager`` that ships the config YAML.
-
-        The packager tars the config file and NeMo-Run extracts it into
-        ``{job_dir}/code/``, which is mounted at ``/nemo_run/code/`` inside
-        the container.
-        """
-        try:
-            import nemo_run as run
-        except ImportError:
-            return
-
-        config_dir = os.path.dirname(config_path)
-        executor.packager = run.PatternPackager(
-            include_pattern=config_path,
-            relative_path=config_dir,
-        )
-
-    def launch(
-        self,
-        config: Dict[str, Any],
-        config_path: Path,
-        recipe_target: str,
-        launcher_config: Dict[str, Any],
-        extra_args: Optional[List[str]] = None,
-    ) -> int:
-        try:
-            import nemo_run as run
-        except ImportError:
-            logger.error("nemo-run is not installed. Install with: pip install nemo-run")
-            sys.exit(1)
-
-        nr_config = NemoRunConfig.from_dict(launcher_config)
-        executor = self._resolve_executor(nr_config)
-
-        # Determine devices (GPUs per node) via the executor's standard
-        # nproc_per_node() method (defined on the base Executor class and
-        # implemented by every backend).
-        try:
-            devices = executor.nproc_per_node()
-        except (NotImplementedError, AttributeError):
-            devices = 1
-
-        # Enable native Torchrun launcher (must be set *before* experiment.run
-        # because NeMo-Run reads it during the packaging phase).
-        self._configure_torchrun(executor, devices)
-
-        # -- Write the training config for both local record and packaging. --
-        job_dir = os.path.join(
-            nr_config.job_dir or os.path.join(os.getcwd(), "nemo_run_jobs"),
-            str(int(_time.time())),
-        )
-        os.makedirs(job_dir, exist_ok=True)
-        config_yaml = yaml.dump(config, default_flow_style=False, sort_keys=False)
-
-        # Local record.
-        local_config_path = os.path.join(job_dir, _CONFIG_FILENAME)
-        with open(local_config_path, "w") as fp:
-            fp.write(config_yaml)
-        logger.info("NeMo-Run job artifacts in: %s", job_dir)
-
-        # Set up PatternPackager so the config is shipped to the remote.
-        self._setup_packager(executor, local_config_path)
-
-        # Build the Script: use ``python -m <module>`` so the recipe is resolved
-        # from the installed package, not a relative file path.
-        module_path = recipe_target.rsplit(".", 1)[0]
-        args = ["-c", _REMOTE_CONFIG_PATH]
-        if extra_args:
-            args.extend(extra_args)
-
-        script = run.Script(
-            path=module_path,
-            m=True,
-            entrypoint="python",
-            args=args,
-        )
-        job_name = nr_config.job_name or f"{recipe_target.rsplit('.', 1)[-1]}"
-
-        return submit_nemo_run_job(
-            script=script,
-            executor=executor,
-            job_name=job_name,
-            detach=nr_config.detach,
-            tail_logs=nr_config.tail_logs,
-        )
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/skypilot/config.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import os
-from dataclasses import dataclass, field
-
-SUPPORTED_CLOUDS = ("aws", "gcp", "azure", "lambda", "kubernetes")
-
-
-@dataclass
-class SkyPilotConfig:
-    # Required: cloud provider
-    cloud: str = field(metadata=dict(help=f"Cloud provider. One of: {SUPPORTED_CLOUDS}"))
-
-    # Compute resources
-    accelerators: str = field(default="T4:1", metadata=dict(help="GPU type and count per node, e.g. 'T4:1', 'A100:8'"))
-    num_nodes: int = field(default=1, metadata=dict(help="Number of nodes for distributed training"))
-    use_spot: bool = field(default=True, metadata=dict(help="Use spot/preemptible instances for cost savings"))
-    disk_size: int = field(default=100, metadata=dict(help="Disk size in GB"))
-    instance_type: str | None = field(
-        default=None, metadata=dict(help="Specific cloud instance type; auto-selected if None")
-    )
-
-    # Cloud location
-    region: str | None = field(default=None, metadata=dict(help="Cloud region"))
-    zone: str | None = field(default=None, metadata=dict(help="Availability zone within the region"))
-
-    # Job identity
-    job_name: str = field(default="", metadata=dict(help="Job and SkyPilot cluster name"))
-
-    # Remote environment
-    setup: str = field(default="", metadata=dict(help="Shell commands run on the remote VM before training starts"))
-    hf_home: str = field(
-        default="~/.cache/huggingface",
-        metadata=dict(help="HuggingFace cache directory on the remote VM"),
-    )
-
-    # Credentials (sourced from env by default, never hard-coded)
-    hf_token: str = field(
-        default_factory=lambda: os.environ.get("HF_TOKEN", ""),
-        metadata=dict(help="HuggingFace token for gated model access"),
-    )
-    wandb_key: str = field(
-        default_factory=lambda: os.environ.get("WANDB_API_KEY", ""),
-        metadata=dict(help="Weights & Biases API key"),
-    )
-    env_vars: dict[str, str] = field(
-        default_factory=dict,
-        metadata=dict(help="Additional environment variables to set on the remote VM"),
-    )
-
-    # Training command (set programmatically by the launcher, not exposed in YAML)
-    command: str = field(default="", metadata=dict(help="Training command executed on the remote VM"))
-
-    def __post_init__(self) -> None:
-        if self.cloud.lower() not in SUPPORTED_CLOUDS:
-            raise ValueError(f"'cloud' must be one of {SUPPORTED_CLOUDS}, got: {self.cloud!r}")
-        if self.num_nodes < 1:
-            raise ValueError(f"'num_nodes' must be >= 1, got: {self.num_nodes}")
-        if self.disk_size < 1:
-            raise ValueError(f"'disk_size' must be >= 1 GB, got: {self.disk_size}")
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/launcher/skypilot/launcher.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import time
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import yaml
-
-from nemo_automodel.components.launcher.base import Launcher
-from nemo_automodel.components.launcher.skypilot.config import SkyPilotConfig
-from nemo_automodel.components.launcher.skypilot.utils import REMOTE_CONFIG_PATH
-
-logger = logging.getLogger(__name__)
-
-
-def _parse_gpus_per_node(accelerators: str) -> int:
-    """Extract GPU count from an accelerator string like ``'A100:8'``.
-
-    Returns 1 when the string cannot be parsed.
-    """
-    parts = accelerators.split(":")
-    if len(parts) == 2:
-        try:
-            return int(parts[1])
-        except ValueError:
-            pass
-    return 1
-
-
-def _recipe_module_path(recipe_target: str, repo_root: str) -> str:
-    """Map a dotted recipe target to the path of its module file under ``repo_root``.
-
-    Everything after the last dot of ``recipe_target`` is dropped, the remaining
-    dots are replaced with ``/`` and ``.py`` is appended. For example, with POSIX
-    path separators, ``('pkg.recipes.train_ft.MyRecipe', '~/sky_workdir')`` gives
-    ``'~/sky_workdir/pkg/recipes/train_ft.py'``.
-
-    Args:
-        recipe_target: Dotted path whose last component is the recipe name.
-        repo_root: Directory the module path is joined onto.
-
-    Returns:
-        The joined path as a string; the file is not checked for existence.
-    """
-    # rsplit(..., 1)[0] keeps the module part; a target without any dot is kept whole.
-    module_path = recipe_target.rsplit(".", 1)[0]
-    return os.path.join(repo_root, module_path.replace(".", "/") + ".py")
-
-
-class SkyPilotLauncher(Launcher):
-    """Launch a recipe job on a cloud VM via SkyPilot."""
-
-    def _build_command(
-        self,
-        recipe_target: str,
-        job_conf_path: str,
-        gpus_per_node: int,
-        num_nodes: int,
-        extra_args: Optional[List[str]] = None,
-    ) -> str:
-        """Assemble the ``torchrun`` shell command line as a single string.
-
-        Args:
-            recipe_target: Dotted recipe path; converted to a script path under
-                ``~/sky_workdir`` via ``_recipe_module_path``.
-            job_conf_path: Config path passed to the script after ``-c``.
-            gpus_per_node: Value used for ``--nproc_per_node``.
-            num_nodes: When greater than 1, the multi-node flags are added.
-            extra_args: Optional extra tokens appended verbatim at the end.
-
-        Returns:
-            All parts joined with single spaces.
-        """
-        repo_root = "~/sky_workdir"
-        script_path = _recipe_module_path(recipe_target, repo_root)
-
-        parts = [
-            f"PYTHONPATH={repo_root}:$PYTHONPATH",
-            "torchrun",
-            f"--nproc_per_node={gpus_per_node}",
-        ]
-
-        if num_nodes > 1:
-            # Node count, rank and head address are left as shell expressions over
-            # SKYPILOT_* variables, so they are resolved when the command runs.
-            # NOTE(review): $SKYPILOT_NODE_IPS is expanded unquoted; if it holds one
-            # IP per line, `echo` would flatten it to a single line and `head -n1`
-            # would not isolate the first IP -- confirm and consider quoting.
-            # NOTE(review): --master_addr/--master_port are combined with
-            # --rdzv_backend=c10d and no --rdzv_endpoint; verify against the torchrun
-            # docs that this backend honors them.
-            parts += [
-                "--nnodes=$SKYPILOT_NUM_NODES",
-                "--node_rank=$SKYPILOT_NODE_RANK",
-                "--rdzv_backend=c10d",
-                "--master_addr=$(echo $SKYPILOT_NODE_IPS | head -n1)",
-                "--master_port=12375",
-            ]
-
-        parts += [script_path, "-c", job_conf_path]
-
-        if extra_args:
-            # Appended as-is; no shell quoting is applied to these tokens.
-            parts.extend(extra_args)
-
-        return " ".join(parts)
-
-    def launch(
-        self,
-        config: Dict[str, Any],
-        config_path: Path,
-        recipe_target: str,
-        launcher_config: Dict[str, Any],
-        extra_args: Optional[List[str]] = None,
-    ) -> int:
-        """Write the job config locally, build the run command and submit the job.
-
-        Args:
-            config: Training config; dumped as YAML to ``job_config.yaml``.
-            config_path: Not referenced in this method body.
-            recipe_target: Dotted recipe path; also supplies the default job name.
-            launcher_config: Launcher options. ``job_dir``, ``gpus_per_node`` and
-                ``job_name`` are consumed here; remaining keys are passed to
-                ``SkyPilotConfig`` only if they are fields of that class.
-            extra_args: Optional extra tokens forwarded to ``_build_command``.
-
-        Returns:
-            Whatever ``submit_skypilot_job`` returns.
-        """
-        # Imported inside the method rather than at module top
-        # (presumably to defer the SkyPilot dependency -- TODO confirm).
-        from nemo_automodel.components.launcher.skypilot.utils import submit_skypilot_job
-
-        # Shallow copy so the pops below do not mutate the caller's dict.
-        skypilot_cfg = dict(launcher_config)
-
-        # Artifacts go to <job_dir or ./skypilot_jobs>/<unix timestamp in seconds>.
-        job_dir = os.path.join(
-            skypilot_cfg.pop("job_dir", os.path.join(os.getcwd(), "skypilot_jobs")),
-            str(int(time.time())),
-        )
-        os.makedirs(job_dir, exist_ok=True)
-
-        # Write the training config (without skypilot section) for upload.
-        job_conf_path = os.path.join(job_dir, "job_config.yaml")
-        with open(job_conf_path, "w") as fp:
-            yaml.dump(config, fp, default_flow_style=False, sort_keys=False)
-        logger.info("SkyPilot job artifacts in: %s", job_dir)
-
-        # `accelerators` and `num_nodes` are read with get() so they stay in
-        # skypilot_cfg; `gpus_per_node` is popped, and a missing or falsy value
-        # falls back to the count parsed from `accelerators`.
-        accelerators = skypilot_cfg.get("accelerators", "T4:1")
-        gpus_per_node = skypilot_cfg.pop("gpus_per_node", None) or _parse_gpus_per_node(accelerators)
-        num_nodes = skypilot_cfg.get("num_nodes", 1)
-
-        # The command references REMOTE_CONFIG_PATH, not the local file written above
-        # (presumably where the config is placed on the VM -- verify in utils).
-        command = self._build_command(
-            recipe_target,
-            REMOTE_CONFIG_PATH,
-            gpus_per_node,
-            num_nodes,
-            extra_args=extra_args,
-        )
-
-        # A missing or empty job name defaults to the last dotted component of recipe_target.
-        job_name = skypilot_cfg.pop("job_name", "") or f"{recipe_target.rsplit('.', 1)[-1]}"
-
-        # Only keys that are SkyPilotConfig fields are forwarded; others are dropped silently.
-        sky_config = SkyPilotConfig(
-            command=command,
-            job_name=job_name,
-            **{k: v for k, v in skypilot_cfg.items() if k in SkyPilotConfig.__dataclass_fields__},
-        )
-
-        return submit_skypilot_job(sky_config, job_dir)
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/launcher/overview.md
-```md
-# Job Launchers
-
-NeMo AutoModel provides several ways to launch training. The right choice depends on your hardware and environment.
-
-## Which Launcher Should I Use?
-
-| Launcher | Best for | GPUs | Guide |
-|---|---|---|---|
-| **Local Workstation** | Getting started, debugging, single-node training | 1-8 on one machine | [Local Workstation](./local-workstation.md) |
-| **Slurm** | Multi-node batch jobs on HPC clusters | 8+ across nodes | [Slurm](./slurm.md) |
-| **NeMo-Run** | Managed execution on Slurm, Kubernetes, Docker, local | 1+ | [NeMo-Run](./nemo-run.md) |
-| **SkyPilot** | Cloud training (AWS, GCP, Azure) with spot pricing | Any | [SkyPilot](./skypilot.md) |
-
-### I have 1-2 GPUs on my workstation
-
-Use the **interactive** launcher. No scheduler or cluster software needed:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-See the [Local Workstation](./local-workstation.md) guide.
-
-### I have access to a Slurm cluster
-
-Add a `slurm:` section to your YAML config and submit with the same `automodel` command. The CLI generates the `torchrun` invocation and calls `sbatch` for you:
-
-```bash
-automodel config_with_slurm.yaml
-```
-
-See the [Slurm](./slurm.md) guide.
-
-### I want managed job submission (Slurm, Kubernetes, Docker)
-
-Add a `nemo_run:` section to your YAML config. NeMo-Run loads a pre-configured executor for your compute target and submits the job:
-
-```bash
-automodel config_with_nemo_run.yaml
-```
-
-See the [NeMo-Run](./nemo-run.md) guide.
-
-### I want to train on the cloud
-
-Add a `skypilot:` section to your YAML config. SkyPilot provisions VMs on any major cloud and handles spot-instance preemption automatically:
-
-```bash
-automodel config_with_skypilot.yaml
-```
-
-See the [SkyPilot](./skypilot.md) guide.
-
-## All Launchers Use the Same Config
-
-Every launcher shares the same YAML recipe format. The only difference is an optional launcher section (`slurm:`, `nemo_run:`, or `skypilot:`) that tells the CLI where to run. Without a launcher section, training runs interactively on the current machine.
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/launcher/local-workstation.md
-```md
-# Run on Your Local Workstation
-
-Use this guide for local, single-node workflows on a workstation or an interactive Slurm allocation. For setup details, refer to our [Installation Guide](../guides/installation.md).
-For batch multi-node jobs, see the [Slurm](./slurm.md) or [SkyPilot](./skypilot.md) guides.
-
-NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide.
-
-## Quick Start: Choose Your Job Launch Option
-
-- **CLI (recommended)**
-  ```bash
-  automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-  ```
-
-- **Direct recipe script**
-  - Single GPU
-    ```bash
-    python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-    ```
-  - Multi-GPU (single node)
-    ```bash
-    torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-    ```
-
-## Run with AutoModel CLI (Single Node)
-
-The AutoModel CLI is the preferred method for most users. It offers a unified interface to launch training scaling from a local workstation (this guide) to large clusters (see our [cluster guide](./slurm.md)).
-
-### Basic Usage
-
-The CLI follows this format:
-```bash
-automodel [--nproc-per-node N] <config.yaml> [--key.subkey=override ...]
-```
-
-A short alias `am` is also available. Both commands also work with `uv run` (e.g., `uv run automodel <config.yaml>`).
-
-Where:
-- `<config.yaml>`: Path to your YAML configuration file (must contain a `recipe._target_` key)
-- `--nproc-per-node`: Optional override for the number of GPUs to use
-
-The recipe class is specified inside the YAML via the `recipe._target_` key:
-```yaml
-recipe:
-  _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-```
-
-### Train on a Single GPU
-
-For simple fine-tuning on a single GPU:
-
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs (Single Node)
-
-For interactive single-node jobs, the CLI automatically detects the number of available GPUs and
-uses `torchrun` for multi-GPU training. You can manually specify the number of GPUs using the `--nproc-per-node` option:
-
-```bash
-automodel --nproc-per-node 2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-If you don't specify `--nproc-per-node`, it will use all available GPUs on your system.
-
-Looking for Slurm or cloud training? See [Slurm](./slurm.md) or [SkyPilot](./skypilot.md).
-
-## Run with uv (Development Mode)
-
-When you need more control over the environment or are actively developing with the codebase, you can use `uv` to run training scripts directly. This approach gives you direct access to the underlying Python scripts and is ideal for debugging or customization.
-
-### Train on a Single GPU
-
-```bash
-uv run nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs with Torchrun (Single Node)
-
-For multi-GPU single-node training, use `torchrun` directly:
-
-```bash
-uv run torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Why Use uv?
-
-uv provides several advantages for development and experimentation:
-
-- **Automatic environment management**: uv automatically creates and manages virtual environments, ensuring consistent dependencies without manual setup.
-- **Lock file synchronization**: Keeps your local environment perfectly synchronized with the project's `uv.lock` file.
-- **No installation required**: Run scripts directly from the repository without installing packages system-wide.
-- **Development flexibility**: Direct access to Python scripts for debugging, profiling, and customization.
-- **Dependency isolation**: Each project gets its own isolated environment, preventing conflicts.
-
-## Run with Torchrun
-
-If you have NeMo AutoModel installed in your environment and prefer to run recipes without uv, you can invoke them directly with `python` (single GPU) or `torchrun` (multiple GPUs):
-
-### Train on a Single GPU
-
-```bash
-python nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-### Train on Multiple GPUs (Single Node)
-
-```bash
-torchrun --nproc-per-node=2 nemo_automodel/recipes/llm/train_ft.py -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```
-
-This approach requires that you have already installed NeMo AutoModel and its dependencies in your Python environment (see the [installation guide](../guides/installation.md) for details).
-
-## Customize Configuration Settings
-
-All approaches use the same YAML configuration files. You can easily customize training by following the steps in this section.
-
-1. **Override config values**: Use command-line arguments to directly replace default settings.
-   For example, if you want to fine-tune `Qwen/Qwen3-0.6B` instead of `meta-llama/Llama-3.2-1B`, you can use:
-   ```bash
-   automodel config.yaml --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B
-   ```
-
-2. **Edit the config file**: Modify the YAML directly for persistent changes.
-
-3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory.
-
-## When to Use Which Approach
-
-**Use the AutoModel CLI when:**
-- You want a simple, unified interface
-- You are running locally on a single machine
-- You don't need to modify the underlying code
-- You prefer a higher-level abstraction
-
-**Use uv when:**
-- You're developing or debugging the codebase
-- You want automatic dependency management
-- You need maximum control over the execution
-- You want to avoid manual environment setup
-- You're experimenting with custom modifications
-
-**Use Torchrun when:**
-- You have a stable, pre-configured environment
-- You prefer explicit control over Python execution
-- You're working in environments where uv is not available
-- You're integrating with existing PyTorch workflows
-
-All approaches use the same configuration files and provide the same training capabilities on a single node. For multi-node training, see [Run on a Cluster](./slurm.md).
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/launcher/slurm.md
-```md
-# Run on a Cluster
-
-In this guide, you will learn how to submit distributed training jobs on Slurm clusters (single- or multi-node). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md). For setup details, refer to our [Installation Guide](../guides/installation.md).
-
-NeMo AutoModel uses recipes to run end-to-end workflows. If you're new to recipes, see the [Repository Structure](../repository-structure.md) guide.
-
-
-## Quickstart
-
-```bash
-# Edit the reference script for your cluster, then submit:
-cp slurm.sub my_cluster.sub
-vim my_cluster.sub
-sbatch my_cluster.sub
-```
-
-For interactive testing on a Slurm node:
-  - Single node, single GPU
-    ```bash
-    automodel your_config.yaml
-    ```
-  - Single node, multiple GPUs
-    ```bash
-    automodel --nproc-per-node 8 your_config.yaml
-    ```
-
-## Submit a Batch Job with Slurm
-
-Slurm clusters vary widely: some use Pyxis containers, others use
-Singularity/Apptainer, and many run bare-metal with environment modules.
-Instead of trying to cover all variations in code, AutoModel provides a
-reference sbatch script that you copy and adapt to your cluster.
-
-### Getting Started
-
-1. Copy the reference script:
-
-```bash
-cp slurm.sub my_cluster.sub
-```
-
-2. Edit `my_cluster.sub` — change `CONFIG`, `#SBATCH` directives (account,
-   partition, nodes, time), container runtime, mounts, and secrets for your
-   cluster.
-
-3. Submit the job:
-
-```bash
-sbatch my_cluster.sub
-```
-
-### How It Works
-
-The reference `slurm.sub` script:
-
-1. Sets `CONFIG` to point at your YAML recipe config
-2. Allocates nodes via SBATCH directives
-3. Sets up the multi-node environment (`MASTER_ADDR`, `MASTER_PORT`)
-4. Runs `torchrun -m nemo_automodel.cli.app $CONFIG` on each node via `srun`
-5. Each torchrun worker detects the distributed environment and runs the recipe in-process
-
-All cluster-specific configuration (SBATCH directives, container runtime,
-mounts, NCCL tuning, secrets) lives in your sbatch script where you can see
-and edit it directly.
-
-
-### Examples
-
-**Pyxis container (NVIDIA clusters):**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p batch
-#SBATCH -t 01:00:00
-#SBATCH -N 8
-#SBATCH --gpus-per-node=8
-#SBATCH --ntasks-per-node=1
-#SBATCH -J automodel-finetune
-#SBATCH --output=slurm_jobs/%x_%j.out
-#SBATCH --error=slurm_jobs/%x_%j.err
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-CONT=/lustre/fsw/images/automodel.sqsh
-CONT_NAME=automodel-training
-CONT_MOUNT="\
-/home/$USER/Automodel:/opt/Automodel,\
-/home/$USER/.cache/huggingface:/root/.cache/huggingface"
-
-export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
-export MASTER_PORT=13742
-
-srun \
-    --container-name="${CONT_NAME}" \
-    --container-image="${CONT}" \
-    --container-mounts="${CONT_MOUNT}" \
-    --container-entrypoint \
-    --no-container-mount-home \
-    --export=ALL \
-    bash -c "\
-        cd /opt/Automodel && \
-        torchrun \
-            --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-            --nnodes=\${SLURM_NNODES:-1} \
-            --rdzv_backend=c10d \
-            --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-            -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-**Bare-metal (no container):**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p gpu
-#SBATCH -N 2
-#SBATCH --gpus-per-node=8
-#SBATCH --time=01:00:00
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=13742
-
-module load cuda/12.8
-source /opt/venvs/automodel/bin/activate
-
-srun bash -c "\
-    torchrun \
-        --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-        --nnodes=\${SLURM_NNODES:-1} \
-        --rdzv_backend=c10d \
-        --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-        -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-**Apptainer / Singularity:**
-
-```bash
-#!/bin/bash
-#SBATCH -A my_account
-#SBATCH -p gpu
-#SBATCH -N 2
-#SBATCH --gpus-per-node=8
-#SBATCH --time=01:00:00
-
-CONFIG=examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-
-export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
-export MASTER_PORT=13742
-
-srun apptainer exec --nv /shared/images/automodel.sif \
-    bash -c "\
-        torchrun \
-            --nproc-per-node=\${SLURM_GPUS_PER_NODE:-8} \
-            --nnodes=\${SLURM_NNODES:-1} \
-            --rdzv_backend=c10d \
-            --rdzv_endpoint=\${MASTER_ADDR}:\${MASTER_PORT} \
-            -m nemo_automodel.cli.app ${CONFIG}"
-```
-
-
-### Launch with Modified Code
-
-If the script is executed from within a Git repository accessible to Slurm
-workers, AutoModel will use the repository source instead of the installation
-inside the container image (it prepends the current working directory to
-`PYTHONPATH` when it detects an editable checkout).
-
-```bash
-git clone git@github.com:NVIDIA-NeMo/Automodel.git automodel_test_repo
-cd automodel_test_repo/
-sbatch slurm.sub
-```
-
-## Customize Configuration Settings
-
-You can customize training by following the steps in this section.
-
-1. **Override config values**: Edit the `CONFIG` variable and add CLI overrides
-   in your torchrun command inside the sbatch script. For example, to change
-   the model:
-   ```bash
-   -m nemo_automodel.cli.app ${CONFIG} --model.pretrained_model_name_or_path Qwen/Qwen3-0.6B
-   ```
-
-2. **Edit the config file**: Modify the YAML directly for persistent changes.
-
-3. **Create custom configs**: Copy and modify existing configurations from the `examples/` directory.
-
-For single-node workflows, see our [Run on Your Local Workstation](./local-workstation.md) guide.
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/launcher/nemo-run.md
-```md
-# Run with NeMo-Run
-
-In this guide, you will learn how to launch NeMo AutoModel training jobs using [NeMo-Run](https://github.com/NVIDIA/NeMo-Run). NeMo-Run supports multiple backends including Slurm, Kubernetes, Docker, and local execution. For cloud-based training, see [Run on Any Cloud with SkyPilot](./skypilot.md). For direct sbatch usage, see [Run on a Cluster (Slurm)](./slurm.md). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md).
-
-NeMo-Run is an open-source tool from NVIDIA that manages job submission across different execution backends. You define your compute configuration once in a Python file and reuse it across all your training jobs.
-
-## Before You Begin
-
-1. **Install NeMo-Run** (it is not bundled with AutoModel):
-
-```bash
-pip install nemo-run
-```
-
-2. **Create an executor definitions file** at `$NEMORUN_HOME/executors.py`. `NEMORUN_HOME` defaults to `~/.nemo_run`; set the environment variable to use a different location. This file tells NeMo-Run how to reach your compute target. Every executor you reference in a YAML config must be defined here. See [Executor Setup](#executor-setup) for a complete example.
-
-3. **Verify connectivity** to the target in your executor (e.g. SSH for Slurm, kubeconfig for Kubernetes).
-
-4. **Set required environment variables** (if needed by your training config):
-
-```bash
-export HF_TOKEN=hf_...          # Required for gated models (e.g. Llama)
-export WANDB_API_KEY=...        # Optional: Weights & Biases logging
-```
-
-## Executor Setup
-
-The `executor:` field in your YAML config is a name that maps to an entry in `$NEMORUN_HOME/executors.py`. This file must define a module-level `EXECUTOR_MAP` dictionary. NeMo-Run supports several executor types -- here are examples of the most common ones:
-
-### Slurm Executor
-
-```python
-import nemo_run as run
-
-def my_slurm_cluster():
-    executor = run.SlurmExecutor(
-        account="my_account",
-        partition="batch",
-        tunnel=run.SSHTunnel(
-            user="myuser",
-            host="login-node.example.com",
-            job_dir="/remote/path/nemo_run/jobs",
-        ),
-        nodes=1,
-        ntasks_per_node=8,
-        gpus_per_node=8,
-        mem="0",
-        exclusive=True,
-        packager=run.Packager(),
-    )
-    executor.container_image = "nvcr.io/nvidia/nemo-automodel:26.02"
-    executor.container_mounts = ["/data:/data", "/checkpoints:/checkpoints"]
-    executor.env_vars = {"HF_HOME": "/data/hf_cache"}
-    executor.time = "04:00:00"
-    return executor
-
-EXECUTOR_MAP = {
-    "my_slurm": my_slurm_cluster(),
-}
-```
-
-### Kubernetes Executor
-
-```python
-import nemo_run as run
-
-def my_k8s_cluster():
-    return run.KubeflowExecutor(
-        namespace="training",
-        image="nvcr.io/nvidia/nemo-automodel:26.02",
-        num_nodes=1,
-        nprocs_per_node=8,
-        gpus_per_node=8,
-    )
-
-EXECUTOR_MAP = {
-    "my_k8s": my_k8s_cluster(),
-}
-```
-
-### Multiple Executors
-
-You can define as many executors as you need for different backends, clusters, or resource configurations:
-
-```python
-EXECUTOR_MAP = {
-    "slurm_dev": my_slurm_dev(),
-    "slurm_prod": my_slurm_prod(),
-    "k8s": my_k8s_cluster(),
-}
-```
-
-- Keys in `EXECUTOR_MAP` are names you reference in YAML (`executor: slurm_dev`).
-- Values can be executor instances or zero-argument callables that return one.
-- Override fields in the YAML (`nodes`, `devices`, `container_image`, etc.) are applied on top of the executor defaults.
-
-## Quickstart
-
-Any existing AutoModel YAML config can be run via NeMo-Run by adding a `nemo_run:` section at the top. For example, given an existing config that you run locally:
-
-```bash
-automodel examples/llm_finetune/qwen/qwen3_moe_30b_te_packed_sequence.yaml
-```
-
-Add a `nemo_run:` block to submit it to a remote executor instead:
-
-```yaml
-# -- Add this section to any existing config ----------------------------------
-nemo_run:
-  executor: my_slurm             # Name from EXECUTOR_MAP in $NEMORUN_HOME/executors.py
-  container_image: /images/custom.sqsh  # Override executor's default image
-  nodes: 1                       # Override number of nodes
-  ntasks_per_node: 8             # GPUs per node
-  time: "04:00:00"               # Override time limit
-  job_name: qwen3_moe_finetune   # Experiment and job name
-
-# -- Everything below is your existing training config (unchanged) ------------
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 32
-  # ... rest of your config ...
-```
-
-Then run the same command:
-
-```bash
-automodel your_config.yaml
-```
-
-The CLI detects the `nemo_run:` key, strips it from the training config, loads the named executor from `$NEMORUN_HOME/executors.py`, and submits the job -- all in one command.
-
-## Configuration Reference
-
-### All `nemo_run:` Fields
-
-| Field | Default | Description |
-|---|---|---|
-| `executor` | `"local"` | Name from `EXECUTOR_MAP` in `$NEMORUN_HOME/executors.py`, or `"local"` for local execution |
-| `job_name` | `<recipe_class_name>` | Experiment and job name |
-| `detach` | `true` | Return immediately after submission |
-| `tail_logs` | `false` | Stream logs after submission |
-| `executors_file` | `$NEMORUN_HOME/executors.py` | Path to the executor definitions file |
-| `job_dir` | `./nemo_run_jobs` | Local directory for job artifacts (config snapshot) |
-| *(any other key)* | *(from executor)* | Applied directly to the executor via `setattr`. Use the executor's native attribute names (e.g. `nodes`, `ntasks_per_node`, `partition`, `container_image`, `time`, `env_vars`). Dicts are merged, lists are extended. |
-
-## Examples
-
-### Single-Node Fine-Tuning (1 x 8 GPUs)
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  nodes: 1
-  ntasks_per_node: 8
-  job_name: single_node_finetune
-```
-
-### Multi-Node Distributed Training (2 x 8 GPUs)
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  nodes: 2
-  ntasks_per_node: 8
-  time: "08:00:00"
-  job_name: multinode_pretrain
-```
-
-For multi-node jobs the launcher automatically adds `--nnodes`, `--node-rank`, `--rdzv-backend`, `--master-addr`, and `--master-port` to the `torchrun` command.
-
-### Custom Container Image and Mounts
-
-```yaml
-nemo_run:
-  executor: my_slurm
-  container_image: /images/automodel_nightly.sqsh
-  container_mounts:
-    - /scratch/datasets:/datasets
-    - /scratch/checkpoints:/checkpoints
-  env_vars:
-    HF_HOME: /datasets/hf_cache
-    NCCL_DEBUG: INFO
-```
-
-### Local Execution (No Cluster)
-
-Use `executor: local` to run on the current machine. No `$NEMORUN_HOME/executors.py` entry is needed:
-
-```yaml
-nemo_run:
-  executor: local
-  ntasks_per_node: 2
-  job_name: local_test
-```
-
-## Monitor and Manage Jobs
-
-NeMo-Run stores experiment metadata under `$NEMORUN_HOME/experiments/`. Set `tail_logs: true` in the YAML to stream job output after submission.
-
-For Slurm-based executors, standard Slurm commands also work:
-
-```bash
-squeue -u $USER                 # List your queued and running jobs
-scancel <job_id>                # Cancel a running or pending job
-sacct -j <job_id>               # View job accounting information
-```
-
-For Kubernetes-based executors, use `kubectl` to monitor pods and jobs.
-
-## How It Works
-
-1. The `automodel` CLI detects the `nemo_run:` key and imports `NemoRunLauncher`.
-2. The `nemo_run:` section is popped from the config. The remaining training config is written to `nemo_run_jobs/<timestamp>/job_config.yaml` for record-keeping.
-3. The launcher loads a pre-configured executor from `$NEMORUN_HOME/executors.py` by name (or creates a `LocalExecutor` for `executor: local`). Override fields are applied on top of the executor defaults.
-4. The training config YAML is embedded in a self-contained inline bash script via a heredoc, so no separate file transfer is needed.
-5. A `torchrun` command is built with `--nproc-per-node` and (for multi-node) distributed rendezvous arguments.
-6. The script is submitted via `nemo_run.Experiment`. By default the call returns immediately (`detach=True`).
-
-## Customize Configuration
-
-Override any training parameter from the command line, same as with local runs:
-
-```bash
-automodel config_with_nemo_run.yaml \
-  --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B
-```
-
-## When to Use NeMo-Run vs. SkyPilot vs. Slurm
-
-| | NeMo-Run | SkyPilot | Slurm (sbatch) |
-|---|---|---|---|
-| **Infrastructure** | Slurm, Kubernetes, Docker, local | Public cloud (AWS, GCP, Azure) | On-prem HPC |
-| **Container support** | Yes (Pyxis/Enroot, Docker, K8s pods) | N/A (cloud VMs) | Manual (in sbatch script) |
-| **Setup required** | `nemo-run` + `$NEMORUN_HOME/executors.py` | Cloud credentials + `sky check` | Cluster access + sbatch script |
-| **Job submission** | `automodel config.yaml` | `automodel config.yaml` | `sbatch slurm.sub` |
-| **Good for** | Managed multi-backend execution, reusable executor configs | Cloud burst, cost optimization, spot instances | Direct Slurm scripts, full control over sbatch |
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/launcher/skypilot.md
-```md
-# Run on Any Cloud with SkyPilot
-
-In this guide, you will learn how to launch NeMo AutoModel training jobs on any major cloud provider (AWS, GCP, Azure, Lambda, Kubernetes) using [SkyPilot](https://skypilot.readthedocs.io). For on-premises cluster usage, see [Run on a Cluster (Slurm)](./slurm.md). For single-node workstation usage, see [Run on Your Local Workstation](./local-workstation.md).
-
-SkyPilot is an open-source framework that abstracts cloud infrastructure so you can train on whichever cloud is cheapest or most available at launch time — including automatic spot-instance handling for significant cost savings.
-
-## Before You Begin
-
-Complete the following setup steps before launching your first AutoModel job on a cloud provider.
-
-1. **Install SkyPilot** with the connector for your target cloud:
-
-```bash
-pip install "skypilot[gcp]"      # Google Cloud
-pip install "skypilot[aws]"      # Amazon Web Services
-pip install "skypilot[azure]"    # Microsoft Azure
-pip install "skypilot[lambda]"   # Lambda Cloud
-pip install "skypilot[kubernetes]"  # Any Kubernetes cluster
-```
-
-2. **Configure your cloud credentials** by following the SkyPilot credential setup guide for your cloud, then verify:
-
-```bash
-sky check
-```
-
-You should see at least one cloud listed as **OK**.
-
-3. **Set required environment variables:**
-
-```bash
-export HF_TOKEN=hf_...          # Required for gated models (e.g. Llama)
-export WANDB_API_KEY=...        # Optional: Weights & Biases logging
-```
-
-## Quickstart
-
-Add a `skypilot:` section to any existing config YAML, then run the same `automodel` command you already know:
-
-```bash
-automodel your_config_with_skypilot.yaml
-```
-
-The CLI detects the `skypilot:` key, strips it from the training config, uploads the code and config to a cloud VM, and launches training — all in one command.
-
-## Configuration Reference
-
-Below is an annotated example for fine-tuning Llama-3.2-1B on SQuAD on a GCP spot T4. A ready-to-run copy lives at [`examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml`](../../examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml).
-
-```yaml
-# ── SkyPilot launcher section ─────────────────────────────────────────────
-# Removed before the training config reaches the remote VM.
-skypilot:
-  cloud: gcp                  # aws | gcp | azure | lambda | kubernetes
-  accelerators: T4:1          # GPU type:count per node, e.g. A100:8
-  use_spot: true              # ~80 % cost reduction vs on-demand
-  disk_size: 100              # Remote VM disk size in GB
-  num_nodes: 1                # Increase for multi-node distributed training
-  region: us-central1         # Optional — SkyPilot picks cheapest if omitted
-  job_name: llama3_2_finetune # Also used as the SkyPilot cluster name
-
-  # Use env-var placeholders so secrets are never stored in YAML
-  hf_token: ${HF_TOKEN}
-  # wandb_key: ${WANDB_API_KEY}
-
-  # Optional: extra shell commands run on the VM after `pip install -e .`
-  # setup: |
-  #   pip install some-extra-dependency
-
-  # Optional: override the default output directory (default: ./skypilot_jobs)
-  # job_dir: /path/to/skypilot/jobs
-
-# ── Training config (forwarded to the VM unchanged) ───────────────────────
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  num_epochs: 1
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# ... rest of your training config ...
-```
-
-### All `skypilot:` Fields
-
-| Field | Default | Description |
-|---|---|---|
-| `cloud` | *(required)* | Cloud provider: `aws`, `gcp`, `azure`, `lambda`, `kubernetes` |
-| `accelerators` | `T4:1` | GPU type and count per node, e.g. `A100:8`, `V100:4` |
-| `num_nodes` | `1` | Number of VMs for distributed training |
-| `use_spot` | `true` | Use spot/preemptible instances |
-| `disk_size` | `100` | Remote VM disk size in GB |
-| `region` | *(auto)* | Cloud region; SkyPilot selects cheapest if omitted |
-| `zone` | *(auto)* | Availability zone within the region |
-| `instance_type` | *(auto)* | Specific instance type; auto-selected if omitted |
-| `job_name` | `<recipe_class_name>` | Job and SkyPilot cluster name |
-| `setup` | *(auto)* | Extra setup commands run after `pip install -e .` |
-| `hf_home` | `~/.cache/huggingface` | Hugging Face cache directory on the remote VM |
-| `hf_token` | `$HF_TOKEN` env | Hugging Face token for gated model access |
-| `wandb_key` | `$WANDB_API_KEY` env | Weights & Biases API key |
-| `env_vars` | `{}` | Additional environment variables for the remote VM |
-| `job_dir` | `./skypilot_jobs` | Local directory for job artifacts (config snapshot, logs) |
-| `gpus_per_node` | *(parsed from `accelerators`)* | Override GPU count per node passed to `torchrun` |
-
-## Cloud Examples
-
-### AWS — On-Demand A10G
-
-```yaml
-skypilot:
-  cloud: aws
-  accelerators: A10G:1
-  use_spot: false
-  region: us-east-1
-  job_name: llm_aws_finetune
-  hf_token: ${HF_TOKEN}
-```
-
-### GCP — spot V100, 8 GPUs (single node)
-
-```yaml
-skypilot:
-  cloud: gcp
-  accelerators: V100:8
-  use_spot: true
-  region: us-west1
-  job_name: llm_gcp_v100_8gpu
-  hf_token: ${HF_TOKEN}
-```
-
-### Multi-node distributed training (2 × 8 × A100)
-
-```yaml
-skypilot:
-  cloud: gcp
-  accelerators: A100:8
-  num_nodes: 2
-  use_spot: false
-  job_name: llm_multinode_a100
-  hf_token: ${HF_TOKEN}
-```
-
-For multi-node jobs the launcher automatically adds the SkyPilot rendezvous environment variables (`$SKYPILOT_NODE_RANK`, `$SKYPILOT_NUM_NODES`, `$SKYPILOT_NODE_IPS`) to the `torchrun` command.
-
-## Monitor and Manage Jobs
-
-After submitting, use standard SkyPilot commands:
-
-```bash
-sky status                    # List running clusters and their status
-sky logs <cluster_name>       # Stream training logs
-sky ssh <cluster_name>        # SSH into the VM for debugging
-sky cancel <cluster_name> <job_id>  # Cancel a running job
-sky down <cluster_name>       # Terminate the cluster and stop billing
-```
-
-## How It Works
-
-1. The `automodel` CLI detects the `skypilot:` key in the YAML and calls `launch_with_skypilot()`.
-2. The training config (with `skypilot:` removed) is written to a local `skypilot_jobs/<timestamp>/job_config.yaml`.
-3. A `sky.Task` is created with:
-   - **workdir** — the current directory synced to `~/sky_workdir` on the remote VM.
-   - **file_mounts** — the job config uploaded to `/tmp/automodel_job_config.yaml`.
-   - **setup** — `pip install -e .` (plus any custom `setup:` commands).
-   - **run** — a `torchrun` command pointing at the recipe script and config.
-4. `sky.launch()` provisions the VM, runs setup, then executes training. The call returns immediately (`detach_run=True`); use `sky logs` to follow progress.
-
-## Customize Configuration
-
-Override any training parameter from the command line, same as with local runs:
-
-```bash
-automodel finetune llm -c config_with_skypilot.yaml \
-  --model.pretrained_model_name_or_path meta-llama/Llama-3.2-3B
-```
-
-## When to Use SkyPilot vs. Slurm
-
-| | SkyPilot | Slurm |
-|---|---|---|
-| **Infrastructure** | Any public cloud | On-premises HPC cluster |
-| **Spot instances** | Yes (automatic) | Depends on cluster config |
-| **Setup required** | Cloud credentials + `sky check` | Cluster access |
-| **Good for** | Flexible cloud burst, cost optimization | Fixed on-prem GPU clusters |
-
-```
-
-File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-# To run this recipe:
-#   automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml --nproc-per-node 8
-# Adjust --nproc-per-node to the number of GPUs available on your machine.
-
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  ckpt_every_steps: 1000
-  val_every_steps: 10  # will run every x number of gradient steps
-  num_epochs: 1
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# torch.compile configuration
-compile:
-  enabled: false
-  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
-  fullgraph: false
-  dynamic: true  # Set to false for better performance with fixed shapes
-  backend: null  # Use default backend (inductor)
-
-clip_grad_norm:
-  max_norm: 1.0
-
-distributed:
-  strategy: fsdp2
-  dp_size: none
-  tp_size: 1
-  cp_size: 1
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-packed_sequence:
-  packed_sequence_size: 0
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-  # min_lr: 1.0e-5
-
-lr_scheduler:
-  lr_decay_style: cosine
-  min_lr: 1.0e-6
-
-# Uncomment and configure for W&B logging
-# wandb:
-#   project: <your_wandb_project>
-#   entity: <your_wandb_entity>
-#   name: <your_wandb_exp_name>
-#   save_dir: <your_wandb_save_dir>
-
-# Uncomment and configure for Mlflow logging
-# mlflow:
-#   experiment_name: "automodel-llm-llama3_2_1b_squad-finetune"
-#   run_name: ""
-#   tracking_uri: null
-#   artifact_location: null 
-#   tags:
-#     task: "squad-finetune"
-#     model_family: "llama3.2"
-#     model_size: "1b"
-#     dataset: "squad"
-#     framework: "automodel"
-
-ci:
-  recipe_owner: akoumpa
-
-```
-
-File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Fine-tune Llama-3.2-1B on SQuAD using SkyPilot for cloud execution.
-#
-# Usage:
-#   automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_skypilot.yaml
-#
-# Prerequisites:
-#   pip install "skypilot[gcp]"   # or [aws], [azure], etc.
-#   sky check                     # verify cloud credentials
-#
-# Monitor:
-#   sky status
-#   sky logs <cluster_name>
-
-# ---------------------------------------------------------------------------
-# SkyPilot launcher config (removed before the job config reaches the VM)
-# ---------------------------------------------------------------------------
-skypilot:
-  cloud: gcp                  # aws | gcp | azure | lambda | kubernetes
-  accelerators: T4:1          # GPU type:count per node
-  use_spot: true              # ~80 % cost reduction vs on-demand
-  disk_size: 100              # GB
-  num_nodes: 1
-  region: us-central1         # optional; SkyPilot picks cheapest if omitted
-  job_name: llama3_2_1b_squad
-
-  # Credentials – use env-var placeholders so secrets are never stored in YAML.
-  hf_token: ${HF_TOKEN}
-  # wandb_key: ${WANDB_API_KEY}
-
-  # Extra setup commands run on the VM after `pip install -e .`
-  # setup: |
-  #   pip install some-extra-dependency
-
-# ---------------------------------------------------------------------------
-# Training config (forwarded to the VM unchanged)
-# ---------------------------------------------------------------------------
-recipe:
-  _target_: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  ckpt_every_steps: 1000
-  val_every_steps: 10
-  num_epochs: 1
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-compile:
-  enabled: false
-  mode: "default"
-  fullgraph: false
-  dynamic: true
-  backend: null
-
-clip_grad_norm:
-  max_norm: 1.0
-
-distributed:
-  strategy: fsdp2
-  dp_size: none
-  tp_size: 1
-  cp_size: 1
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-packed_sequence:
-  packed_sequence_size: 0
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-
-lr_scheduler:
-  lr_decay_style: cosine
-  min_lr: 1.0e-6
-
-ci:
-  recipe_owner: adil-a
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/automodel-sft-peft-core.txt b/skills/nemotron-customize/context/automodel-sft-peft-core.txt
deleted file mode 100644
index 5e2f02727..000000000
--- a/skills/nemotron-customize/context/automodel-sft-peft-core.txt
+++ /dev/null
@@ -1,6696 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Automodel
-├── docs
-│   ├── guides
-│   │   ├── llm
-│   │   │   ├── dataset.md *
-│   │   │   └── finetune.md *
-│   │   ├── diffusion
-│   │   ├── omni
-│   │   ├── vlm
-│   │   ├── checkpointing.md *
-│   │   └── dataset-overview.md *
-│   ├── about
-│   ├── launcher
-│   └── model-coverage
-│       ├── diffusion
-│       │   ├── black-forest-labs
-│       │   ├── hunyuanvideo-community
-│       │   └── wan-ai
-│       ├── llm
-│       │   ├── allenai
-│       │   ├── baai
-│       │   ├── baichuan-inc
-│       │   ├── bigcode
-│       │   ├── bytedance-seed
-│       │   ├── cohere
-│       │   ├── deepseek-ai
-│       │   ├── eleutherai
-│       │   ├── google
-│       │   ├── ibm
-│       │   ├── inceptionai
-│       │   ├── internlm
-│       │   ├── lgai-exaone
-│       │   ├── meta
-│       │   ├── microsoft
-│       │   ├── minimax
-│       │   ├── mistralai
-│       │   ├── moonshotai
-│       │   ├── nvidia
-│       │   ├── openai
-│       │   ├── openbmb
-│       │   ├── orionstar
-│       │   ├── parasail-ai
-│       │   ├── qwen
-│       │   ├── stabilityai
-│       │   ├── stepfun-ai
-│       │   ├── thudm
-│       │   ├── tiiuae
-│       │   └── upstage
-│       ├── omni
-│       │   ├── microsoft
-│       │   └── qwen
-│       └── vlm
-│           ├── google
-│           ├── huggingface
-│           ├── internlm
-│           ├── llava-hf
-│           ├── meta
-│           ├── mistralai
-│           ├── moonshotai
-│           ├── nvidia
-│           └── qwen
-├── examples
-│   ├── llm_finetune
-│   │   ├── llama3_1
-│   │   │   └── llama3_1_8b_columnmapped_lora.yaml *
-│   │   ├── llama3_2
-│   │   │   ├── llama3_2_1b_squad.yaml *
-│   │   │   └── llama3_2_1b_squad_peft.yaml *
-│   │   ├── baichuan
-│   │   ├── cohere
-│   │   ├── deepseek_v32
-│   │   ├── devstral
-│   │   ├── falcon
-│   │   ├── gemma
-│   │   ├── glm
-│   │   ├── gpt_oss
-│   │   ├── granite
-│   │   ├── llama3_3
-│   │   ├── minimax_m2
-│   │   ├── mistral
-│   │   ├── moonlight
-│   │   ├── nemotron
-│   │   ├── nemotron_flash
-│   │   ├── olmo
-│   │   ├── phi
-│   │   ├── qwen
-│   │   ├── seed
-│   │   ├── starcoder
-│   │   └── stepfun
-│   ├── convergence
-│   │   └── tulu3
-│   │       ├── data
-│   │       ├── eval
-│   │       ├── inference
-│   │       ├── model-verification
-│   │       ├── models
-│   │       │   ├── gpt-oss-20b
-│   │       │   │   └── assets
-│   │       │   ├── moonlight-16b
-│   │       │   │   └── assets
-│   │       │   ├── qwen3-4b
-│   │       │   │   └── assets
-│   │       │   └── qwen3-moe-30b
-│   │       │       ├── assets
-│   │       │       └── experiments
-│   │       └── training
-│   ├── diffusion
-│   │   ├── finetune
-│   │   ├── generate
-│   │   │   └── configs
-│   │   └── pretrain
-│   ├── dllm_generate
-│   ├── dllm_sft
-│   ├── llm_benchmark
-│   │   ├── deepseek
-│   │   ├── glm
-│   │   ├── gpt_oss
-│   │   ├── kimi
-│   │   ├── llama3_3
-│   │   ├── minimax
-│   │   ├── mistral
-│   │   ├── moonlight
-│   │   ├── nemotron
-│   │   ├── qwen
-│   │   └── step
-│   ├── llm_kd
-│   │   └── llama3_2
-│   ├── llm_pretrain
-│   ├── llm_seq_cls
-│   │   └── glue
-│   ├── retrieval
-│   │   ├── bi_encoder
-│   │   │   └── llama_embed_nemotron_8b
-│   │   ├── cross_encoder
-│   │   └── data_utils
-│   ├── vlm_benchmark
-│   │   ├── kimi
-│   │   ├── mistral
-│   │   └── qwen
-│   ├── vlm_finetune
-│   │   ├── gemma3
-│   │   ├── gemma3n
-│   │   ├── gemma4
-│   │   ├── internvl
-│   │   ├── kimi
-│   │   ├── mistral
-│   │   ├── mistral4
-│   │   ├── nemotron
-│   │   ├── phi4
-│   │   ├── qwen2_5
-│   │   ├── qwen3
-│   │   ├── qwen3_5
-│   │   └── qwen3_5_moe
-│   └── vlm_generate
-├── nemo_automodel
-│   ├── components
-│   │   ├── _peft
-│   │   │   ├── lora.py * +
-│   │   │   └── module_matcher.py * +
-│   │   ├── datasets
-│   │   │   ├── llm
-│   │   │   │   ├── megatron
-│   │   │   │   ├── chat_dataset.py * +
-│   │   │   │   └── column_mapped_text_instruction_dataset.py * +
-│   │   │   ├── diffusion
-│   │   │   ├── dllm
-│   │   │   ├── vlm
-│   │   │   └── utils.py * +
-│   │   ├── attention
-│   │   ├── checkpoint
-│   │   │   └── _backports
-│   │   ├── config
-│   │   ├── distributed
-│   │   │   └── pipelining
-│   │   ├── flow_matching
-│   │   │   └── adapters
-│   │   ├── launcher
-│   │   │   ├── nemo_run
-│   │   │   └── skypilot
-│   │   ├── loggers
-│   │   ├── loss
-│   │   │   └── triton
-│   │   ├── models
-│   │   │   ├── baichuan
-│   │   │   ├── common
-│   │   │   ├── deepseek_v3
-│   │   │   ├── deepseek_v32
-│   │   │   ├── gemma4_moe
-│   │   │   ├── glm4_moe
-│   │   │   ├── glm4_moe_lite
-│   │   │   ├── glm_moe_dsa
-│   │   │   ├── gpt_oss
-│   │   │   ├── kimi_k25_vl
-│   │   │   ├── kimivl
-│   │   │   ├── llama
-│   │   │   ├── llama_bidirectional
-│   │   │   ├── minimax_m2
-│   │   │   ├── mistral3
-│   │   │   ├── mistral4
-│   │   │   ├── nemotron_parse
-│   │   │   ├── nemotron_v3
-│   │   │   ├── qwen2
-│   │   │   ├── qwen3_5_moe
-│   │   │   ├── qwen3_moe
-│   │   │   ├── qwen3_next
-│   │   │   ├── qwen3_omni_moe
-│   │   │   ├── qwen3_vl_moe
-│   │   │   └── step3p5
-│   │   ├── moe
-│   │   │   ├── megatron
-│   │   │   └── uccl_ep
-│   │   ├── optim
-│   │   ├── quantization
-│   │   ├── training
-│   │   └── utils
-│   ├── recipes
-│   │   ├── llm
-│   │   │   └── train_ft.py * +
-│   │   ├── diffusion
-│   │   ├── dllm
-│   │   ├── retrieval
-│   │   └── vlm
-│   ├── _diffusers
-│   ├── _transformers
-│   │   └── tokenization
-│   ├── autonvtx
-│   ├── cli
-│   └── shared
-├── .github
-│   ├── actions
-│   │   ├── build-container
-│   │   └── test-template
-│   └── workflows
-│       └── config
-├── docker
-│   └── common
-├── scripts
-├── skills
-│   ├── .claude
-│   │   └── skills
-│   │       ├── developer-guide
-│   │       ├── distributed-training
-│   │       ├── launcher-config
-│   │       ├── model-onboarding
-│   │       ├── parity-testing
-│   │       └── recipe-development
-│   ├── developer-guide
-│   ├── distributed-training
-│   ├── launcher-config
-│   ├── model-onboarding
-│   ├── parity-testing
-│   └── recipe-development
-├── tests
-│   ├── ci_tests
-│   │   ├── configs
-│   │   │   ├── llm_benchmark
-│   │   │   ├── llm_finetune
-│   │   │   ├── vlm_benchmark
-│   │   │   └── vlm_finetune
-│   │   ├── golden_values
-│   │   │   ├── llm_finetune
-│   │   │   │   ├── baichuan
-│   │   │   │   ├── falcon
-│   │   │   │   ├── gemma
-│   │   │   │   ├── glm
-│   │   │   │   ├── gpt_oss
-│   │   │   │   ├── granite
-│   │   │   │   ├── llama3_1
-│   │   │   │   ├── llama3_2
-│   │   │   │   ├── mistral
-│   │   │   │   ├── moonlight
-│   │   │   │   ├── nemotron
-│   │   │   │   ├── nemotron_flash
-│   │   │   │   ├── olmo
-│   │   │   │   ├── phi
-│   │   │   │   ├── qwen
-│   │   │   │   ├── seed
-│   │   │   │   └── starcoder
-│   │   │   └── vlm_finetune
-│   │   │       ├── gemma3
-│   │   │       ├── gemma3n
-│   │   │       ├── internvl
-│   │   │       ├── mistral
-│   │   │       ├── nemotron
-│   │   │       ├── qwen2_5
-│   │   │       ├── qwen3
-│   │   │       └── qwen3_5_moe
-│   │   ├── scripts
-│   │   └── utils
-│   ├── functional_tests
-│   │   ├── checkpoint
-│   │   ├── checkpoint_robustness
-│   │   ├── context_parallel
-│   │   ├── data
-│   │   │   └── llm
-│   │   ├── datasets
-│   │   │   └── llm
-│   │   ├── hf_dcp
-│   │   ├── hf_peft
-│   │   ├── hf_transformer
-│   │   ├── hf_transformer_finetune
-│   │   ├── hf_transformer_llm
-│   │   ├── hf_transformer_vlm
-│   │   ├── llm_pretrain_and_kd
-│   │   │   ├── customizer_retrieval
-│   │   │   ├── llm_seq_cls
-│   │   │   └── loss
-│   │   ├── retrieval
-│   │   └── training
-│   ├── unit_tests
-│   │   ├── _cli
-│   │   ├── _diffusers
-│   │   ├── _peft
-│   │   ├── _transformers
-│   │   ├── attention
-│   │   ├── checkpoint
-│   │   ├── components
-│   │   │   └── training
-│   │   ├── config
-│   │   ├── datasets
-│   │   │   ├── diffusion
-│   │   │   ├── dllm
-│   │   │   ├── llm
-│   │   │   └── vlm
-│   │   ├── diffusion_processors
-│   │   ├── distributed
-│   │   │   └── pipelining
-│   │   ├── flow_matching
-│   │   │   └── adapters
-│   │   ├── launcher
-│   │   ├── loggers
-│   │   ├── loss
-│   │   ├── models
-│   │   │   ├── baichuan
-│   │   │   ├── bi_encoder
-│   │   │   ├── common
-│   │   │   ├── deepseek_v3
-│   │   │   ├── deepseek_v32
-│   │   │   ├── gemma4
-│   │   │   ├── glm4_moe
-│   │   │   ├── glm4_moe_lite
-│   │   │   ├── glm_moe_dsa
-│   │   │   ├── gpt_oss
-│   │   │   ├── kimi_k25_vl
-│   │   │   ├── kimivl
-│   │   │   ├── llama
-│   │   │   ├── minimax_m2
-│   │   │   ├── mistral3
-│   │   │   ├── mistral4
-│   │   │   ├── nemotron_parse
-│   │   │   ├── nemotron_v3
-│   │   │   ├── qwen2
-│   │   │   ├── qwen3_5
-│   │   │   ├── qwen3_5_moe
-│   │   │   ├── qwen3_moe
-│   │   │   ├── qwen3_next
-│   │   │   ├── qwen3_omni_moe
-│   │   │   ├── qwen3_vl_moe
-│   │   │   └── step3p5
-│   │   ├── moe
-│   │   ├── optim
-│   │   ├── quantization
-│   │   ├── recipes
-│   │   │   ├── dllm
-│   │   │   └── llm
-│   │   ├── shared
-│   │   ├── tools
-│   │   ├── training
-│   │   └── utils
-│   └── utils
-├── tools
-│   └── diffusion
-│       ├── data
-│       └── processors
-└── tutorials
-    └── nemotron-parse
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; selected files shown.
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Automodel/nemo_automodel/recipes/llm/train_ft.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import inspect
-import logging
-import pathlib
-import time
-from contextlib import nullcontext
-from typing import TYPE_CHECKING, Any, Dict, Optional
-
-import torch
-import torch.nn as nn
-import wandb
-from huggingface_hub import constants as hf_constants
-from torch.utils.data import DataLoader, IterableDataset
-from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp
-from torchdata.stateful_dataloader.sampler import StatefulDistributedSampler
-from transformers import AutoConfig
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from wandb import Settings
-
-from nemo_automodel._transformers import NeMoAutoModelForCausalLM, NeMoAutoModelForSequenceClassification
-from nemo_automodel._transformers.auto_tokenizer import NeMoAutoTokenizer
-from nemo_automodel._transformers.infrastructure import (
-    apply_model_infrastructure,
-    instantiate_infrastructure,
-)
-from nemo_automodel._transformers.mfu import AutoMFU
-from nemo_automodel._transformers.utils import apply_cache_compatibility_patches
-from nemo_automodel.components.checkpoint.checkpointing import (
-    Checkpointer,
-    CheckpointingConfig,
-)
-from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
-from nemo_automodel.components.datasets.llm.megatron.sampler import create_megatron_sampler
-from nemo_automodel.components.datasets.llm.megatron_dataset import MegatronPretraining
-from nemo_automodel.components.datasets.llm.packed_sequence import pack_dataset
-from nemo_automodel.components.distributed.config import MegatronFSDPConfig
-from nemo_automodel.components.distributed.cp_utils import make_cp_batch_and_ctx
-from nemo_automodel.components.distributed.init_utils import (
-    initialize_distributed,
-)
-from nemo_automodel.components.distributed.megatron_fsdp import fully_shard_optimizer
-from nemo_automodel.components.distributed.mesh import MeshContext
-from nemo_automodel.components.distributed.pipelining import AutoPipeline
-from nemo_automodel.components.distributed.utils import FirstRankPerNode, get_sync_ctx
-from nemo_automodel.components.loggers.comet_utils import build_comet
-from nemo_automodel.components.loggers.log_utils import setup_logging
-from nemo_automodel.components.loggers.metric_logger import MetricsSample, build_metric_logger
-from nemo_automodel.components.loggers.mlflow_utils import build_mlflow
-from nemo_automodel.components.loggers.wandb_utils import suppress_wandb_log_messages
-from nemo_automodel.components.loss.linear_ce import FusedLinearCrossEntropy
-from nemo_automodel.components.loss.masked_ce import MaskedCrossEntropy
-from nemo_automodel.components.moe.megatron.moe_utils import MoEAuxLossAutoScaler
-from nemo_automodel.components.optim.scheduler import OptimizerParamScheduler
-from nemo_automodel.components.optim.utils import build_dion_optimizer, is_dion_optimizer
-from nemo_automodel.components.quantization.fp8 import build_fp8_config
-from nemo_automodel.components.training.model_output_utils import get_final_hidden_states
-from nemo_automodel.components.training.rng import ScopedRNG, StatefulRNG
-from nemo_automodel.components.training.step_scheduler import StepScheduler
-from nemo_automodel.components.training.utils import (
-    count_tail_padding,
-    prepare_after_first_microbatch,
-    prepare_for_final_backward,
-    prepare_for_grad_accumulation,
-    scale_grads_and_clip_grad_norm,
-)
-from nemo_automodel.components.utils.compile_utils import (
-    build_compile_config,
-)
-from nemo_automodel.components.utils.flops_utils import calculate_mfu
-from nemo_automodel.components.utils.model_utils import (
-    _supports_logits_to_keep,
-    _supports_seq_lens,
-    filter_forward_kwargs,
-    resolve_trust_remote_code,
-)
-from nemo_automodel.recipes._dist_setup import setup_distributed
-from nemo_automodel.recipes.base_recipe import BaseRecipe
-from nemo_automodel.shared.te_patches import apply_te_patches
-from nemo_automodel.shared.utils import dtype_from_str
-
-if TYPE_CHECKING:
-    from torch.optim import Optimizer
-
-    from nemo_automodel.components.distributed.init_utils import DistInfo
-
-logger = logging.getLogger(__name__)
-
-
-# ---------------------------
-#  Stateless helper functions
-# ---------------------------
-def _get_model_name(cfg_model):
-    if cfg_model.get("pretrained_model_name_or_path", None) is not None:
-        return cfg_model.pretrained_model_name_or_path
-    elif cfg_model.get("config", None) is not None:
-        if isinstance(cfg_model.config, str):
-            return cfg_model.config
-        return cfg_model.config.get("pretrained_model_name_or_path", None)
-    else:
-        return None
-
-
-def _uses_te_dot_product_attention(model_or_cfg):
-    """Check whether the model uses TE DotProductAttention.
-
-    Accepts either an instantiated nn.Module (preferred — inspects actual modules)
-    or a config object (fallback — checks backend.attn string).
-    """
-    if isinstance(model_or_cfg, torch.nn.Module):
-        try:
-            from transformer_engine.pytorch.attention import DotProductAttention
-        except ImportError:
-            return False
-        return any(isinstance(m, DotProductAttention) for m in model_or_cfg.modules())
-    # Config fallback for call sites before model is built
-    return (
-        hasattr(model_or_cfg, "backend") and hasattr(model_or_cfg.backend, "attn") and model_or_cfg.backend.attn == "te"
-    )
-
-
-def _uses_thd_collater(cfg_dataloader):
-    from nemo_automodel.components.datasets.utils import packed_sequence_thd_collater
-
-    return (
-        True
-        if hasattr(cfg_dataloader, "collate_fn") and cfg_dataloader.collate_fn == packed_sequence_thd_collater
-        else False
-    )
-
-
-def _get_num_thd_chunks(pp_enabled, cfg):
-    if pp_enabled:
-        return cfg.step_scheduler.local_batch_size // cfg.get("distributed.pipeline.pp_microbatch_size", 1)
-    return 1
-
-
-def build_model(
-    cfg_model,
-    cfg_peft,
-    seed,
-    has_packed_sequence=False,
-    cfg_fp8=None,
-    cfg_compile=None,
-    cfg_quantization=None,
-    device_mesh=None,
-    moe_mesh=None,
-    distributed_config=None,
-    pipeline_config=None,
-    cfg_qat=None,
-    cfg_moe=None,
-    activation_checkpointing=False,
-    unfreeze_modules: list[str] | None = None,
-    sdpa_method: list[str] | None = None,
-) -> tuple[nn.Module | AutoPipeline, list["Optimizer"]]:  # noqa: F821
-    """Build and initialize a model.
-
-    Args:
-        cfg_model: Configuration for model instantiation.
-        cfg_peft: Configuration for PEFT.
-        seed: Random seed.
-        has_packed_sequence: Whether using packed sequences.
-        cfg_fp8: Configuration for FP8.
-        cfg_compile: Configuration for torch.compile.
-        cfg_quantization: Configuration for BitsAndBytes quantization.
-        device_mesh: Device mesh for distributed training.
-        moe_mesh: MOE mesh for expert parallelism.
-        distributed_config: Strategy-specific distributed config (FSDP2Config, etc.).
-        pipeline_config: Pipeline parallelism config.
-        cfg_qat: Configuration for QAT (will be instantiated to QATConfig).
-        cfg_moe: MoEParallelizerConfig instance, or ConfigNode to be converted.
-        activation_checkpointing: Whether to enable activation checkpointing.
-        unfreeze_modules: List of module names/substrings to unfreeze.
-        sdpa_method: Explicit list of SDPA backend name strings (e.g.
-            ``["flash_attention", "efficient_attention"]``), or ``None`` to
-            auto-select based on CP / activation checkpointing.
-    """
-    with ScopedRNG(seed=seed, ranked=True):
-        kwargs = {
-            "has_packed_sequence": has_packed_sequence,
-            "peft_config": cfg_peft,
-            "device_mesh": device_mesh,
-            "moe_mesh": moe_mesh,
-            "distributed_config": distributed_config,
-            "pipeline_config": pipeline_config,
-            "sdpa_method": sdpa_method,
-        }
-
-        if cfg_qat is not None and cfg_qat.get("enabled", False):
-            if cfg_peft is not None:
-                raise ValueError("QAT with PEFT is not currently supported")
-            qat_config_attr = getattr(cfg_qat, "qat_config", None)
-            if qat_config_attr is not None:
-                kwargs["qat_config"] = qat_config_attr.instantiate()
-            else:
-                # Fallback to legacy quantizer format for backward compatibility
-                quantizer_attr = getattr(cfg_qat, "quantizer", None)
-                if quantizer_attr is not None:
-                    kwargs["qat_config"] = quantizer_attr.instantiate()
-
-        if cfg_moe is not None:
-            from nemo_automodel.components.moe.config import MoEParallelizerConfig
-
-            if isinstance(cfg_moe, MoEParallelizerConfig):
-                kwargs["moe_config"] = cfg_moe
-            else:
-                moe_dict = cfg_moe.to_dict() if hasattr(cfg_moe, "to_dict") else dict(cfg_moe)
-                # activation_checkpointing is handled separately; strip config keys
-                moe_dict.pop("activation_checkpointing", None)
-                moe_dict.pop("_target_", None)
-                kwargs["moe_config"] = MoEParallelizerConfig(**moe_dict)
-            kwargs["activation_checkpointing"] = activation_checkpointing
-
-        if cfg_fp8 is not None:
-            kwargs["fp8_config"] = build_fp8_config(cfg_fp8)
-        if cfg_compile is not None:
-            kwargs["compile_config"] = build_compile_config(cfg_compile)
-        if cfg_quantization is not None:
-            logger.info("Model weight quantization enabled with BitsAndBytes")
-            from nemo_automodel.components.quantization.qlora import create_bnb_config
-
-            kwargs["quantization_config"] = create_bnb_config(cfg_quantization)
-
-        is_nemo_auto_model = cfg_model.get("_target_", None) in (
-            NeMoAutoModelForCausalLM.from_config,
-            NeMoAutoModelForCausalLM.from_pretrained,
-            NeMoAutoModelForSequenceClassification.from_config,
-            NeMoAutoModelForSequenceClassification.from_pretrained,
-        )
-
-        if is_nemo_auto_model:
-            # NeMoAutoModel handles infrastructure internally
-            model = cfg_model.instantiate(**kwargs)
-        else:
-            # For non-NemoAutoModel entry points (e.g., build_gpt2_model),
-            # instantiate the model first, then apply infrastructure separately.
-            # Note: sdpa_method is not supported here — SDPA patching only runs
-            # inside NeMoAutoModel._build_model.
-            if sdpa_method is not None:
-                logger.warning("sdpa_method is ignored for non-NeMoAutoModel targets.")
-            # We must convert config objects into runtime objects (model_wrapper,
-            # autopipeline, parallelize_fn, etc.) via instantiate_infrastructure,
-            # exactly as from_pretrained/from_config do internally.
-            model = cfg_model.instantiate()
-
-            mesh = MeshContext.from_meshes(device_mesh, moe_mesh)
-            model_wrapper, autopipeline, parallelize_fn, qat_quantizer = instantiate_infrastructure(
-                distributed_config=distributed_config,
-                pipeline_config=pipeline_config,
-                qat_config=kwargs.get("qat_config"),
-                moe_config=kwargs.get("moe_config"),
-                activation_checkpointing=kwargs.get("activation_checkpointing", False),
-                device=torch.device("cuda", torch.cuda.current_device()),
-                mesh=mesh,
-            )
-            loss_fn = pipeline_config.loss_fn if pipeline_config is not None else None
-
-            model = apply_model_infrastructure(
-                model,
-                is_meta_device=False,
-                device=torch.cuda.current_device(),
-                mesh=mesh,
-                model_wrapper=model_wrapper,
-                autopipeline=autopipeline,
-                parallelize_fn=parallelize_fn,
-                qat_quantizer=qat_quantizer,
-                loss_fn=loss_fn,
-                peft_config=kwargs.get("peft_config"),
-                fp8_config=kwargs.get("fp8_config"),
-                compile_config=kwargs.get("compile_config"),
-                quantization_config=kwargs.get("quantization_config"),
-                pretrained_model_name_or_path=None,
-                load_base_model=False,
-                cache_dir=hf_constants.HF_HUB_CACHE,
-            )
-
-    # Explicitly unfreeze specified modules (e.g. task heads) that need full fine-tuning
-    if unfreeze_modules:
-        for name, param in model.named_parameters():
-            if any(module_name in name for module_name in unfreeze_modules):
-                param.requires_grad_(True)
-        logging.info(f"Unfroze parameters matching: {unfreeze_modules}")
-
-    return model
-
-
-def build_optimizer(model, cfg_opt, distributed_config, device_mesh):
-    """Build an optimizer for the model.
-
-    Args:
-        model: The model to build an optimizer for.
-        cfg_opt: The configuration for the optimizer.
-        distributed_config: The distributed configuration.
-        device_mesh: The device mesh.
-    """
-    # Resolve dtype strings (e.g. "torch.bfloat16") to torch.dtype objects for
-    # optimizers like TE FusedAdam that accept dtype kwargs.
-    for attr in ("master_weight_dtype", "exp_avg_dtype", "exp_avg_sq_dtype"):
-        val = getattr(cfg_opt, attr, None)
-        if isinstance(val, str):
-            setattr(cfg_opt, attr, dtype_from_str(val))
-
-    if device_mesh is not None and "tp" in device_mesh.mesh_dim_names and device_mesh["tp"].size() > 1:
-        # TP does not support foreach
-        cfg_opt.foreach = False
-
-    optimizer = []
-    has_dion_optimizer = is_dion_optimizer(cfg_opt)
-    for part in getattr(model, "parts", [model]):
-        trainable_params = list(filter(lambda x: x.requires_grad, part.parameters()))
-        assert len(trainable_params) > 0, "trainable_params cannot be empty"
-        # TODO(@akoumparouli): no branching for building the optimizer, refactor.
-        if has_dion_optimizer:
-            tmp_optimizer = build_dion_optimizer(
-                cfg_opt=cfg_opt,
-                model=part,
-                distributed_mesh=device_mesh,
-            )
-        else:
-            tmp_optimizer = cfg_opt.instantiate(params=trainable_params)
-        if isinstance(distributed_config, MegatronFSDPConfig) and torch.distributed.get_world_size() > 1:
-            assert not has_dion_optimizer, "Dion optimizer does not support fully_shard_optimizer"
-            tmp_optimizer = fully_shard_optimizer(part, tmp_optimizer)
-        optimizer.append(tmp_optimizer)
-
-    return optimizer
-
-
-def build_checkpoint_config(cfg_ckpt, cache_dir, model_repo_id, is_peft) -> CheckpointingConfig:
-    """Build a checkpoint configuration.
-
-    Args:
-        cfg_ckpt: Configuration for checkpointing.
-        cache_dir: Cache directory for the model.
-        model_repo_id: Model repository ID.
-        is_peft: Whether the model is PEFT.
-        state_dict_keys: Copy of the model state dict keys before any parallelization.
-
-    Returns:
-        The instantiated checkpoint configuration.
-    """
-
-    ckpt_kwargs = dict(
-        enabled=True,
-        checkpoint_dir="checkpoints/",
-        model_save_format="safetensors",
-        model_repo_id=model_repo_id,
-        model_cache_dir=cache_dir if cache_dir is not None else hf_constants.HF_HUB_CACHE,
-        save_consolidated=True,
-        is_peft=is_peft,
-    )
-    if cfg_ckpt is not None:
-        cfg_ckpt = cfg_ckpt.to_dict()
-        cfg_ckpt.pop("restore_from", None)
-        ckpt_kwargs |= cfg_ckpt
-    if ckpt_kwargs.get("is_peft", False) and ckpt_kwargs.get("model_save_format") == "torch_save":
-        raise ValueError(
-            "PEFT checkpointing is not supported for torch_save format. Save using `safetensors` format instead."
-        )
-    checkpoint_config = CheckpointingConfig(**ckpt_kwargs)
-    return checkpoint_config
-
-
-def build_loss_fn(cfg_loss):
-    """Build a loss function.
-
-    Args:
-        cfg_loss (ConfigNode): Loss function configuration.
-
-    Returns:
-        The instantiated loss function on the specified device.
-    """
-    return cfg_loss.instantiate()
-
-
-def compute_trust_remote_code_from_model(cfg_model):
-    """Compute the value of trust_remote_code based on the model configuration.
-
-    Args:
-        cfg_model (ConfigNode): Model configuration.
-
-    Returns:
-        bool: Whether to trust remote code.
-    """
-    if hasattr(cfg_model, "trust_remote_code"):
-        return getattr(cfg_model, "trust_remote_code")
-    elif hasattr(cfg_model, "config") and hasattr(cfg_model.config, "trust_remote_code"):
-        return getattr(cfg_model.config, "trust_remote_code")
-    return resolve_trust_remote_code(_get_model_name(cfg_model))
-
-
-def _build_tokenizer(cfg_model, cfg_ds):
-    trust_remote_code = compute_trust_remote_code_from_model(cfg_model)
-    # if tokenizer is not provided, use the model config to instantiate it
-    if "tokenizer" not in cfg_ds and _get_model_name(cfg_model) is not None:
-        logging.info("Using model config to instantiate tokenizer")
-        tokenizer = NeMoAutoTokenizer.from_pretrained(_get_model_name(cfg_model), trust_remote_code=trust_remote_code)
-    elif cfg_ds.get("tokenizer", None) is None:
-        tokenizer = None
-    elif "_target_" not in cfg_ds.tokenizer:
-        tokenizer_dict = cfg_ds.tokenizer.to_dict()
-        trust_remote_code = tokenizer_dict.pop("trust_remote_code", trust_remote_code)
-        tokenizer = NeMoAutoTokenizer.from_pretrained(**tokenizer_dict, trust_remote_code=trust_remote_code)
-    else:
-        trust_remote_code = cfg_ds.tokenizer.to_dict().pop("trust_remote_code", trust_remote_code)
-        tokenizer = cfg_ds.tokenizer.instantiate(trust_remote_code=trust_remote_code)
-
-    # Finally, check if the dataset target accepts a tokenizer parameter
-    kwargs = {}
-    if tokenizer is not None and callable(cfg_ds._target_):
-        try:
-            sig = inspect.signature(cfg_ds._target_)
-            if "tokenizer" in sig.parameters:
-                kwargs["tokenizer"] = tokenizer
-        except (ValueError, TypeError):
-            # If we can't get the signature, skip adding tokenizer
-            pass
-    return kwargs, tokenizer
-
-
-def build_dataloader(
-    cfg_ds,
-    cfg_dl,
-    cfg_model,
-    cfg_ps,
-    seed,
-    local_batch_size,
-    global_batch_size,
-    max_steps,
-    val_check_interval,
-    dp_rank,
-    dp_world_size,
-    pp_enabled,
-    cp_size=1,
-    model: Optional[nn.Module] = None,
-) -> tuple[DataLoader, PreTrainedTokenizerBase]:
-    """Build a DataLoader for the dataset.
-
-    Args:
-        cfg_ds: Dataset configuration.
-        cfg_dl: DataLoader configuration.
-        cfg_model: Model configuration.
-        cfg_ps: Packed sequence configuration.
-        seed: Random seed.
-        local_batch_size: Local batch size.
-        global_batch_size: Global batch size.
-        max_steps: Maximum number of steps.
-        val_check_interval: Validation check interval.
-        dp_rank: Data parallel rank.
-        dp_world_size: Data parallel world size.
-        pp_enabled: Whether pipeline parallelism is enabled.
-        cp_size: Context parallel size.
-        model: Optional model instance. If provided and packed sequences are enabled,
-            seq_lens will only be included if the model's forward() accepts it.
-    Returns:
-        The instantiated DataLoader and tokenizer.
-    """
-    with ScopedRNG(seed=seed, ranked=True):
-        kwargs, tokenizer = _build_tokenizer(cfg_model, cfg_ds)
-        # Megatron specific kwargs
-        if cfg_ds._target_ == MegatronPretraining:
-            kwargs["global_batch_size"] = global_batch_size
-            kwargs["trainer_max_steps"] = max_steps if max_steps is not None else None
-            kwargs["trainer_val_check_interval"] = val_check_interval
-            ds = cfg_ds.instantiate(**kwargs)
-            ds.build()
-        else:
-            with FirstRankPerNode():
-                ds = cfg_ds.instantiate(**kwargs)
-
-        # If using an IterableDataset, per-rank sharding for unique samples
-        if isinstance(ds, IterableDataset):
-            if callable(getattr(ds, "shard", None)):
-                ds = ds.shard(dp_world_size, dp_rank)
-                logging.info(f"Sharded IterableDataset via dataset.shard: world_size={dp_world_size}, rank={dp_rank}")
-            elif hasattr(ds, "dataset"):
-                # HuggingFace streaming datasets: split by file shards when possible.
-                from datasets.distributed import split_dataset_by_node
-
-                assert hasattr(ds, "dataset"), "dataset must have a dataset attribute"
-                ds.dataset = split_dataset_by_node(ds.dataset, world_size=dp_world_size, rank=dp_rank)
-                logging.info(f"Sharded dataset via split_dataset_by_node: world_size={dp_world_size}")
-            else:
-                logging.warning("IterableDataset does not support sharding; Data may be duplicated across ranks.")
-
-        packed_sequence_size = getattr(cfg_ps, "packed_sequence_size", 0)
-        packing_strategy = getattr(cfg_ps, "packing_strategy", "thd")
-
-        # check if packed sequence is supported (only for thd strategy)
-        supports_seq_lens = _supports_seq_lens(model)
-        if packed_sequence_size > 0 and packing_strategy == "thd" and not supports_seq_lens:
-            logging.warning("Packed sequence is not supported without seq_lens; disabling packed sequence")
-            packed_sequence_size = 0
-
-        # Apply packing if configured
-        if packed_sequence_size > 0:
-            logger.info(f"Packing dataset with size: {packed_sequence_size}, strategy: {packing_strategy}")
-            if hasattr(ds, "shuffle"):
-                ds = ds.shuffle(seed)
-
-            if packing_strategy == "neat":
-                from nemo_automodel.components.datasets.llm.neat_packing import neat_pack_dataset
-                from nemo_automodel.components.datasets.utils import neat_packed_collater
-                from nemo_automodel.components.models.common.packing import configure_packing, get_attn_implementation
-
-                ds = neat_pack_dataset(
-                    ds,
-                    split=cfg_ds.split,
-                    pack_size=packed_sequence_size,
-                    max_packs=getattr(cfg_ps, "max_packs", None),
-                    padding_idx=getattr(tokenizer, "pad_token_id", 0),
-                    drop_long_samples=getattr(cfg_ps, "drop_long_samples", False),
-                )
-                _attn_impl = get_attn_implementation(cfg_model)
-                configure_packing(attn_implementation=_attn_impl)
-                # Set collater with attn_implementation so it produces the right mask format
-                cfg_dl.collate_fn = lambda batch, _ai=_attn_impl: neat_packed_collater(batch, attn_implementation=_ai)
-                logger.info(f"Configured neat packing for attn_implementation={_attn_impl}")
-            else:
-                # "thd" — existing packing logic
-                ds = pack_dataset(
-                    ds,
-                    split=cfg_ds.split,
-                    packed_sequence_size=packed_sequence_size,
-                    max_packs=getattr(cfg_ps, "max_packs", None),
-                    padding_idx=getattr(tokenizer, "pad_token_id", 0),
-                    cp_size=cp_size,
-                )
-
-        if isinstance(ds, MegatronPretraining):
-            ds = ds.get_dataset(split=cfg_ds.splits_to_build)
-            dataloader_type = cfg_dl.get("dataloader_type", "single")
-            if "dataloader_type" in cfg_dl:
-                del cfg_dl.dataloader_type
-            batch_sampler = create_megatron_sampler(
-                dataset_len=len(ds),
-                micro_batch_size=local_batch_size,
-                global_batch_size=global_batch_size,
-                dataloader_type=dataloader_type,
-                rank=dp_rank,
-                world_size=dp_world_size,
-            )
-            dl_kwargs = {"batch_sampler": batch_sampler}
-        elif not isinstance(ds, IterableDataset):
-            shuffle = cfg_dl.get("shuffle", True)
-            if "shuffle" in cfg_dl:
-                del cfg_dl.shuffle
-
-            group_by_length = cfg_dl.get("group_by_length", False)
-            if "group_by_length" in cfg_dl:
-                del cfg_dl.group_by_length
-
-            if group_by_length:
-                from nemo_automodel.components.datasets.llm.length_grouped_sampler import (
-                    LengthGroupedSampler as LLMLengthGroupedSampler,
-                )
-
-                sampler = LLMLengthGroupedSampler(
-                    dataset=ds,
-                    batch_size=local_batch_size,
-                    seed=seed,
-                    num_replicas=dp_world_size,
-                    rank=dp_rank,
-                )
-            else:
-                dist_sampler_kwargs = {
-                    "num_replicas": dp_world_size,
-                    "rank": dp_rank,
-                    "shuffle": shuffle,
-                }
-                sampler = StatefulDistributedSampler(
-                    ds,
-                    seed=seed,
-                    drop_last=True,
-                    **dist_sampler_kwargs,
-                )
-            dl_kwargs = {"sampler": sampler, "batch_size": local_batch_size}
-            if pp_enabled:
-                dl_kwargs["drop_last"] = True
-        else:
-            logging.info("Using IterableDataset; skipping sampler.")
-            # Optional shuffle for streaming IterableDataset (uses HF dataset shuffle if available)
-            shuffle = cfg_dl.get("shuffle", False)
-            shuffle_buffer_size = cfg_dl.get("shuffle_buffer_size", 10000)
-            # Do not pass shuffle-related kwargs to the DataLoader when using IterableDataset
-            # But leave them in dl config to be consistent
-            if hasattr(cfg_dl, "shuffle"):
-                del cfg_dl.shuffle
-            if hasattr(cfg_dl, "shuffle_buffer_size"):
-                del cfg_dl.shuffle_buffer_size
-
-            if shuffle and hasattr(ds, "shuffle"):
-                try:
-                    ds = ds.shuffle(buffer_size=shuffle_buffer_size, seed=seed)
-                    logging.info(f"Shuffling IterableDataset with buffer_size={shuffle_buffer_size}, seed={seed}")
-                except Exception as e:
-                    logging.warning(f"IterableDataset shuffle skipped due to error: {e}")
-            dl_kwargs = {}
-
-        # Handle collate_fn with optional mask precomputation for pipeline parallelism
-        dl_kwargs = dl_kwargs | {"dataset": ds}
-
-        # Handle collate_fn instantiation if it's a ConfigNode
-        if hasattr(cfg_dl, "collate_fn"):
-            if hasattr(cfg_dl.collate_fn, "_target_"):
-                collate_cfg = cfg_dl.collate_fn
-                dl_kwargs["collate_fn"] = lambda batch: collate_cfg.instantiate(batch=batch)
-            else:
-                dl_kwargs["collate_fn"] = cfg_dl.collate_fn
-            assert callable(dl_kwargs["collate_fn"]), "collate_fn must be callable"
-
-        # Chain with mask precomputation if PP is enabled
-        if pp_enabled:
-            from nemo_automodel.components.datasets.utils import add_causal_masks_to_batch
-
-            try:
-                hf_model_config = AutoConfig.from_pretrained(
-                    _get_model_name(cfg_model), trust_remote_code=compute_trust_remote_code_from_model(cfg_model)
-                )
-            except Exception:
-                logger.warning(
-                    "Failed to load model config for causal mask precomputation. "
-                    "Pipeline parallel mask precomputation will be skipped."
-                )
-            else:
-                if "collate_fn" in dl_kwargs:
-                    # Case 1: PP enabled + collate_fn exists -> chain them
-                    # base_collate_fn -> add_causal_masks_to_batch
-                    base_collate_fn = dl_kwargs["collate_fn"]
-
-                    def chained_collate_fn(batch, base_fn=base_collate_fn, config=hf_model_config):
-                        batch = base_fn(batch)  # Apply base collate (padding, batching, etc.)
-                        batch = add_causal_masks_to_batch(batch, model_config=config)  # Add masks
-                        return batch
-
-                    dl_kwargs["collate_fn"] = chained_collate_fn
-                else:
-                    # Case 2: PP enabled + no collate_fn -> only add masks
-                    dl_kwargs["collate_fn"] = lambda batch, config=hf_model_config: add_causal_masks_to_batch(
-                        batch, model_config=config
-                    )
-
-        try:
-            import torch.multiprocessing as mp
-
-            if mp.get_start_method(allow_none=True) is None:
-                mp.set_start_method("spawn", force=True)
-        except RuntimeError:
-            pass
-        return cfg_dl.instantiate(**dl_kwargs), tokenizer
-
-
-def build_distributed(cfg_dist: Dict[str, Any]) -> "DistInfo":  # noqa: F821
-    """Build and initialize distributed training resources.
-
-    Args:
-        cfg_dist: Configuration for distributed training.
-
-    Returns:
-        Distributed training information from initialize_distributed.
-    """
-    backend = cfg_dist.get("backend", "nccl")
-    timeout = cfg_dist.get("timeout_minutes", 1)
-    return initialize_distributed(backend=backend, timeout_minutes=timeout)
-
-
-def build_step_scheduler(cfg, dataloader, dp_group_size, local_batch_size):
-    """Build the step scheduler.
-
-    Args:
-        cfg: configuration for the StepScheduler class.
-        dataloader: the training dataloader, used for extracting the epoch_len (in batches).
-        dp_group_size: the size of the data parallel group.
-        micro_batch_size: the size of the micro batch.
-
-    Returns:
-        StepScheduler: the configured StepScheduler.
-    """
-    assert "_target_" not in cfg, "_target_ not permitted in step scheduler"
-    default_kwargs = dict(
-        num_epochs=10,
-        global_batch_size=32,
-        local_batch_size=local_batch_size,
-        dp_size=dp_group_size,
-        ckpt_every_steps=100,
-        dataloader=dataloader,
-    )
-    if cfg is not None:
-        default_kwargs |= cfg.to_dict()
-    return StepScheduler(**default_kwargs)
-
-
-def build_lr_scheduler(cfg, optimizer, step_scheduler) -> list[OptimizerParamScheduler] | None:  # noqa: F821
-    """Build the learning rate scheduler.
-
-    Args:
-        cfg: Configuration for the OptimizerParamScheduler.
-        optimizer: The optimizer to be scheduled.
-        step_scheduler: The step scheduler to extract training parameters.
-
-    Returns:
-        OptimizerParamScheduler: The configured learning rate scheduler, or None if not configured.
-    """
-    if cfg is None:
-        return None
-
-    # Calculate total steps for the training run
-    total_epochs = step_scheduler.num_epochs
-    epoch_len = len(step_scheduler.dataloader)
-    grad_acc_steps = step_scheduler.grad_acc_steps
-
-    # Total optimizer steps (accounting for gradient accumulation)
-    total_steps = (total_epochs * epoch_len) // grad_acc_steps
-    if step_scheduler.max_steps is not None:
-        total_steps = min(total_steps, step_scheduler.max_steps)
-
-    # Set defaults for scheduler parameters
-    optimizer_param_schedulers = []
-    user_kwargs = cfg.to_dict()
-    default_kwargs = dict(
-        lr_warmup_steps=min(1000, total_steps // 10),  # 10% warmup or max 1000 steps
-        lr_decay_steps=total_steps,
-        lr_decay_style="cosine",
-        wd_incr_steps=total_steps,
-        wd_incr_style="constant",
-    )
-
-    if not isinstance(optimizer, list):
-        optimizer = [optimizer]
-
-    for opt in optimizer:
-        base_lr = opt.param_groups[0]["lr"]
-        default_kwargs.update(
-            dict(
-                optimizer=opt,
-                init_lr=base_lr * 0.1,  # Start warmup at 10% of base LR
-                max_lr=base_lr,
-                min_lr=base_lr * 0.01,  # End at 1% of base LR
-                start_wd=opt.param_groups[0].get("weight_decay", 0.0),
-                end_wd=opt.param_groups[0].get("weight_decay", 0.0),
-            )
-        )
-        default_kwargs.update(user_kwargs)
-        optimizer_param_schedulers.append(OptimizerParamScheduler(**default_kwargs))
-
-    logger.info(
-        f"Building LR scheduler with total_steps={total_steps}, "
-        f"warmup_steps={default_kwargs['lr_warmup_steps']}, "
-        f"decay_style={default_kwargs['lr_decay_style']}"
-    )
-
-    return optimizer_param_schedulers
-
-
-def build_wandb(cfg) -> wandb.Run:
-    """Instantiates wandb and returns the instance. If no name is given, it will use the model name.
-
-    Args:
-        cfg: Configuration for wandb.
-
-    Returns:
-        The wandb instance.
-    """
-    assert cfg.get("wandb", None) is not None
-    kwargs = cfg.wandb.to_dict()
-    if kwargs.get("name", "") == "":
-        kwargs["name"] = "_".join(_get_model_name(cfg.model).split("/")[-2:])
-    run = wandb.init(
-        **kwargs,
-        config=cfg.to_dict(),
-        settings=Settings(silent=True),
-    )
-    return run
-
-
-def calculate_loss(loss_fn, **kwargs) -> torch.Tensor:
-    """Calculate the loss.
-
-    Args:
-        loss_fn: Loss function.
-        **kwargs: Keyword arguments for the loss function.
-
-    Returns:
-        The loss.
-    """
-    loss_fn_kwargs = {"num_label_tokens": kwargs.pop("num_label_tokens", None)}
-    if isinstance(loss_fn, FusedLinearCrossEntropy):
-        model = kwargs.pop("model")
-        labels = kwargs.pop("labels")
-
-        # find the lm_head in the model
-        lm_head = None
-        if hasattr(model, "get_output_embeddings"):
-            lm_head = model.get_output_embeddings().weight
-        else:
-            for n, p in model.named_parameters(remove_duplicate=False):
-                if "lm_head" in n and n.endswith(".weight"):
-                    lm_head = p
-                    break
-        if lm_head is None:
-            raise ValueError("lm_head.weight not found in model")
-
-        # unshard the possibly sharded lm_head
-        lm_head = lm_head.full_tensor() if hasattr(lm_head, "full_tensor") else lm_head
-        loss_fn_kwargs.update(
-            {
-                "hidden_states": kwargs.pop("hidden_states"),
-                "labels": labels,
-                "lm_weight": lm_head,
-            }
-        )
-    else:
-        loss_fn_kwargs.update(
-            {
-                "logits": kwargs.pop("logits"),
-                "labels": kwargs.pop("labels"),
-            }
-        )
-
-    return loss_fn(**loss_fn_kwargs)
-
-
-def build_validation_dataloader(cfg, dp_world_size, dp_rank, pp_enabled, model: Optional[nn.Module] = None):
-    def _prepare_val_ds_name(val_ds_name):
-        val_ds_name = val_ds_name.replace("validation_dataset", "")
-        if len(val_ds_name) > 1 and val_ds_name[0] in ("_", "-", "."):
-            val_ds_name = val_ds_name[1:]
-        if val_ds_name == "":
-            val_ds_name = "default"
-        return val_ds_name
-
-    # Build validation dataloader if the config provides it
-    val_dataloaders = {}
-    for val_ds_name in filter(lambda x: x.startswith("validation_dataset"), cfg.to_dict().keys()):
-        val_ds_cfg = cfg.get(val_ds_name, None)
-        val_ds_name = _prepare_val_ds_name(val_ds_name)
-        val_dataloaders[val_ds_name] = build_dataloader(
-            val_ds_cfg,
-            cfg.validation_dataloader,
-            cfg.model,
-            cfg_ps=cfg.get("packed_sequence", None)
-            if _uses_te_dot_product_attention(cfg.model) and _uses_thd_collater(cfg.dataloader)
-            else None,
-            seed=cfg.get("seed", 42),
-            local_batch_size=cfg.get("step_scheduler.local_batch_size", 1),
-            global_batch_size=cfg.get("step_scheduler.global_batch_size", 1),
-            max_steps=cfg.get("step_scheduler.max_steps", None),
-            val_check_interval=cfg.get("step_scheduler.val_every_steps", None),
-            dp_rank=dp_rank,
-            dp_world_size=dp_world_size,
-            pp_enabled=pp_enabled,
-            cp_size=cfg.get("distributed.cp_size", 1),
-            model=model,
-        )[0]
-
-    return val_dataloaders
-
-
-# ---------------------------------------------------------------------------
-#  Trainer class – orchestration only
-# ---------------------------------------------------------------------------
-
-
-class TrainFinetuneRecipeForNextTokenPrediction(BaseRecipe):
-    """Recipe for fine-tuning a model for next-token prediction.
-
-    This class orchestrates training, from setup to main training loop.
-    """
-
-    def __init__(self, cfg):
-        """Initialize the recipe with configuration.
-
-        Args:
-            cfg: Configuration dictionary/object for training.
-        """
-        self.cfg = cfg
-
-    # ------------------ build phase ------------------
-    def setup(self):
-        """Builds all components needed for training/validation/logging/checkpointing/etc.
-
-        This is the last place where self.cfg should be referenced.
-
-        Raises:
-            NotImplemented: Raises if it tries to restore a checkpoint; will be removed.
-        """
-        torch.cuda.reset_peak_memory_stats()
-        self.dist_env = build_distributed(self.cfg.get("dist_env", {}))
-        # setups logging and adds the rankfilter to logging
-        setup_logging()
-
-        apply_cache_compatibility_patches()
-        apply_te_patches()
-        # Set up the stateful random number generator
-        self.rng = StatefulRNG(seed=self.cfg.get("seed", 42), ranked=True)
-        # Enable NVTX patching only when explicitly requested in config
-        self.enable_nvtx = bool(self.cfg.get("nvtx", False))
-
-        self.dist_setup = setup_distributed(self.cfg, world_size=self.dist_env.world_size)
-        self.distributed_config = self.dist_setup.strategy_config
-        self.device_mesh = self.dist_setup.device_mesh
-        self.moe_mesh = self.dist_setup.moe_mesh
-        self.pp_enabled = self.dist_setup.pp_enabled
-        self.pipeline_config = self.dist_setup.pipeline_config
-
-        if self.dist_env.is_main and hasattr(self.cfg, "wandb"):
-            suppress_wandb_log_messages()
-            run = build_wandb(self.cfg)
-            logging.info("🚀 View run at {}".format(run.url))
-
-        self.mlflow_logger = None
-        if self.dist_env.is_main and hasattr(self.cfg, "mlflow"):
-            self.mlflow_logger = build_mlflow(self.cfg)
-            self.mlflow_logger.log_params(self.cfg.to_dict())
-            logging.info("MLflow experiment tracking enabled")
-
-        self.comet_logger = None
-        if self.dist_env.is_main and hasattr(self.cfg, "comet"):
-            self.comet_logger = build_comet(self.cfg)
-            self.comet_logger.log_params(self.cfg.to_dict())
-            logging.info("Comet experiment tracking enabled")
-
-        # Log experiment details on main rank
-        self._log_experiment_details()
-        self._log_library_versions()
-
-        # Build loss_fn (will be set on pipeline_config if PP enabled)
-        self.loss_fn = build_loss_fn(self.cfg.loss_fn)
-
-        # Pipeline runtime fields: override pp_batch_size and pp_microbatch_size
-        if self.pp_enabled:
-            pp_batch_size = self.cfg.step_scheduler.local_batch_size
-            pp_microbatch_size = self.cfg.get("distributed.pipeline.pp_microbatch_size", 1)
-
-            assert pp_batch_size // pp_microbatch_size >= self.dist_setup.pp_size, (
-                f"pp_batch_size {pp_batch_size} // pp_microbatch_size {pp_microbatch_size} must be >= pp_size {self.dist_setup.pp_size}"
-            )
-
-            # THD override logic
-            if (
-                self.dist_setup.cp_size > 1
-                and _uses_te_dot_product_attention(self.cfg.model)
-                and _uses_thd_collater(self.cfg.dataloader)
-            ):
-                pp_microbatch_size = 1
-                pp_batch_size = pp_batch_size // self.cfg.get("distributed.pipeline.pp_microbatch_size", 1)
-                logging.info(
-                    f"Overriding pp_batch_size: {pp_batch_size}, pp_microbatch_size: {pp_microbatch_size} for THD"
-                )
-
-            assert not isinstance(self.distributed_config, MegatronFSDPConfig), (
-                "MegatronFSDPConfig is not supported when pipeline parallelism is enabled"
-            )
-
-            # Update pipeline_config runtime fields
-            self.pipeline_config.pp_batch_size = pp_batch_size
-            self.pipeline_config.pp_microbatch_size = pp_microbatch_size
-            self.pipeline_config.patch_stage_backward_maybe_with_nosync = self.cfg.get(
-                "model.backend.enable_fsdp_optimizations", False
-            )
-            self.pipeline_config.loss_fn = self.loss_fn
-
-            # Infer pp_seq_len from dataset config if not explicitly set
-            if hasattr(self.pipeline_config, "pp_seq_len") and self.pipeline_config.pp_seq_len is None:
-                packed_seq_size = self.cfg.get("packed_sequence.packed_sequence_size", 0)
-                if packed_seq_size > 0:
-                    self.pipeline_config.pp_seq_len = packed_seq_size
-                elif self.cfg.get("dataset.seq_len", None) is not None:
-                    self.pipeline_config.pp_seq_len = self.cfg.dataset.seq_len
-
-        # Build components
-        self.peft_config = None
-        if self.cfg.get("peft", None) is not None:
-            self.peft_config = self.cfg.peft.instantiate()
-
-        # Build checkpoint config
-        checkpoint_config = build_checkpoint_config(
-            self.cfg.get("checkpoint", None),
-            self.cfg.get("model.cache_dir", None),
-            _get_model_name(self.cfg.model),
-            True if self.cfg.get("peft", None) else False,
-        )
-
-        if self.cfg.get("clip_grad_norm.max_norm", None) is not None:
-            self.max_grad_norm = float(self.cfg.clip_grad_norm.max_norm)
-        else:
-            logging.info("No clip_grad_norm.max_norm specified in config, using default value of 1.0")
-            self.max_grad_norm = 1.0
-
-        # Create Checkpointer instance
-        self.checkpointer = Checkpointer(
-            config=checkpoint_config,
-            dp_rank=self._get_dp_rank(include_cp=True),
-            tp_rank=self._get_tp_rank(),
-            pp_rank=self._get_pp_rank(),
-            moe_mesh=self.moe_mesh,
-        )
-
-        # Disable fused RoPE when context parallelism is enabled (cp > 1)
-        if self.dist_setup.cp_size > 1 and self.cfg.get("model.backend.rope_fusion", False):
-            logging.info("Disabling rope_fusion because cp_size=%d > 1", self.dist_setup.cp_size)
-            self.cfg.model.backend.rope_fusion = False
-
-        model = build_model(
-            self.cfg.model,
-            self.peft_config,
-            has_packed_sequence=self.cfg.get("packed_sequence.packed_sequence_size", 0) > 0,
-            seed=self.cfg.get("seed", 42),
-            cfg_fp8=self.cfg.get("fp8", None),
-            cfg_compile=self.cfg.get("compile", None),
-            cfg_quantization=self.cfg.get("quantization", None),
-            device_mesh=self.device_mesh,
-            moe_mesh=self.moe_mesh,
-            distributed_config=self.distributed_config,
-            pipeline_config=self.pipeline_config,
-            cfg_qat=self.cfg.get("qat", None),
-            cfg_moe=self.dist_setup.moe_config,
-            activation_checkpointing=self.dist_setup.activation_checkpointing,
-            sdpa_method=self.cfg.get("sdpa_method", None),
-        )
-        self.optimizer = build_optimizer(model, self.cfg.optimizer, self.distributed_config, self.device_mesh)
-
-        if not _supports_logits_to_keep(model) and not isinstance(self.loss_fn, MaskedCrossEntropy):
-            logger.warning("logits_to_keep not found in model.forward. Using MaskedCrossEntropy instead.")
-            self.loss_fn = MaskedCrossEntropy()
-
-        if isinstance(model, AutoPipeline):
-            self.model_parts = model.parts
-            self.pp = model
-            if self.enable_nvtx:
-                import nemo_automodel.autonvtx as autonvtx
-
-                # Patch each pipeline stage with NVTX profiling
-                for i, part in enumerate(self.model_parts):
-                    autonvtx.patch(part, name=f"PipelineStage_{i}")
-        else:
-            if self.enable_nvtx:
-                import nemo_automodel.autonvtx as autonvtx
-
-                # Patch model with NVTX profiling
-                autonvtx.patch(model, name=model.__class__.__name__)
-            self.model_parts = [model]
-            self.pp = None
-
-        # Extract TE FP8 config from model backend (set after model construction)
-        self.te_fp8 = self.model_parts[0].backend.te_fp8 if hasattr(self.model_parts[0], "backend") else None
-
-        _packed_seq_size = self.cfg.get("packed_sequence.packed_sequence_size", 0)
-        if self.dist_setup.cp_size > 1 and _packed_seq_size > 0:
-            _m = self.model_parts[0]
-            if hasattr(_m, "supports") and not _m.supports_cp_with_sequence_packing:
-                raise ValueError(
-                    f"Context parallelism (cp_size={self.dist_setup.cp_size}) with packed sequences "
-                    f"is not supported for {type(_m).__name__}.\n"
-                    f"Either disable sequence packing:\n"
-                    f"  packed_sequence:\n"
-                    f"    packed_sequence_size: 0\n"
-                    f"or switch to the TE attention backend -- MoE models only:\n"
-                    f"  model:\n"
-                    f"    backend:\n"
-                    f"      attn: te"
-                )
-
-        self.dataloader, self.tokenizer = build_dataloader(
-            self.cfg.dataset,
-            self.cfg.dataloader,
-            self.cfg.model,
-            self.cfg.get("packed_sequence", None),
-            seed=self.cfg.get("seed", 42),
-            local_batch_size=self.cfg.get("step_scheduler.local_batch_size", 1),
-            global_batch_size=self.cfg.get("step_scheduler.global_batch_size", 1),
-            max_steps=self.cfg.get("step_scheduler.max_steps", None),
-            val_check_interval=self.cfg.get("step_scheduler.val_every_steps", None),
-            dp_rank=self._get_dp_rank(),
-            dp_world_size=self._get_dp_group_size(),
-            pp_enabled=self.pp_enabled,
-            cp_size=self.cfg.get("distributed.cp_size", 1),
-            model=self.model_parts[0],
-        )
-        self.val_dataloaders = build_validation_dataloader(
-            self.cfg,
-            self._get_dp_group_size(),
-            self._get_dp_rank(),
-            self.pp_enabled,
-            model=self.model_parts[0],
-        )
-        self.best_metric_key = self.cfg.get("checkpoint.best_metric_key", "default")
-        # Scheduler
-        self.step_scheduler = build_step_scheduler(
-            self.cfg.get("step_scheduler", None),
-            self.dataloader,
-            self._get_dp_group_size(),
-            local_batch_size=self.cfg.get("step_scheduler.local_batch_size", 1),
-        )
-        self._setup_garbage_collection(self.step_scheduler)
-
-        # Build learning rate scheduler
-        self.lr_scheduler = build_lr_scheduler(self.cfg.get("lr_scheduler", None), self.optimizer, self.step_scheduler)
-
-        # Log model, parameter counts, norms, optimizer and scheduler
-        self._log_model_and_optimizer_details(self.model_parts, self.optimizer, self.lr_scheduler)
-
-        # Handle delayed fake-quant toggling for QAT if configured
-        self._qat_disable_fn, self._qat_enable_fn, self._qat_enable_after = self._setup_qat(self.cfg, self.model_parts)
-
-        # Enable MoE load balance tracking if configured
-        moe_metrics_cfg = self.cfg.get("moe_metrics", None)
-        if moe_metrics_cfg and moe_metrics_cfg.get("enabled", False):
-            from nemo_automodel.components.moe.load_balance_metrics import enable_load_balance_tracking
-
-            for mp in self.model_parts:
-                enable_load_balance_tracking(mp)
-
-        self.mfu_calculator = AutoMFU.from_config(self.model_parts[0])
-
-        # NEFTune: noisy embeddings for improved instruction fine-tuning
-        neftune_cfg = self.cfg.get("neftune", None)
-        self.neftune = None
-        if neftune_cfg is not None:
-            from nemo_automodel.components.training.neftune import NEFTune
-
-            noise_alpha = neftune_cfg.get("noise_alpha", 5.0) if hasattr(neftune_cfg, "get") else neftune_cfg
-            self.neftune = NEFTune(noise_alpha=float(noise_alpha))
-            self.neftune.activate(self.model_parts[0])
-
-        restore_from = self.cfg.get("checkpoint.restore_from", None)
-        # Initialize JSONL loggers
-        self.metric_logger_train = build_metric_logger(
-            pathlib.Path(self.checkpointer.config.checkpoint_dir) / "training.jsonl"
-        )
-        self.metric_logger_valid = {
-            name: build_metric_logger(
-                pathlib.Path(self.checkpointer.config.checkpoint_dir)
-                / (f"validation_{name}.jsonl" if name != "default" else "validation.jsonl")
-            )
-            for name in self.val_dataloaders.keys()
-        }
-
-        # Optionally resume
-        self.load_checkpoint(restore_from)
-        torch.cuda.empty_cache()
-
-        # Log step scheduler details
-        self._log_step_scheduler_details(self.step_scheduler)
-
-    def _collect_moe_load_balance(self):
-        """Collect MoE load balance metrics with DP all-reduce.
-
-        Must be called on ALL ranks (the all-reduce is collective).
-        Stores the result in ``self._moe_layer_loads`` for rank-0 logging.
-        """
-        moe_metrics_cfg = self.cfg.get("moe_metrics", None)
-        if not (moe_metrics_cfg and moe_metrics_cfg.get("enabled", False)):
-            self._moe_layer_loads = None
-            return
-
-        from nemo_automodel.components.moe.load_balance_metrics import collect_expert_loads
-
-        dp_group = self._get_dp_group(include_cp=True)
-        all_loads: dict = {}
-        for mp in self.model_parts:
-            all_loads.update(collect_expert_loads(mp, dp_group=dp_group))
-        self._moe_layer_loads = all_loads if all_loads else None
-
-    def _log_moe_metrics(self, step: int, wandb_log_fn) -> None:
-        """Log MoE load balance metrics to wandb.
-
-        Call after :meth:`_collect_moe_load_balance`.  Only logs when
-        ``_moe_layer_loads`` is populated and a wandb log function is provided.
-
-        Args:
-            step: Current training/benchmark step for wandb x-axis.
-            wandb_log_fn: Callable like ``wandb.log`` or ``wandb_run.log``.
-        """
-        if not getattr(self, "_moe_layer_loads", None):
-            return
-
-        from nemo_automodel.components.moe.load_balance_metrics import (
-            compute_brief_metrics,
-            compute_detailed_metrics,
-        )
-
-        moe_metrics_cfg = self.cfg.get("moe_metrics", None)
-        mode = moe_metrics_cfg.get("mode", "brief") if moe_metrics_cfg else "brief"
-        top_k = moe_metrics_cfg.get("top_k_experts", 0) if moe_metrics_cfg else 0
-        if mode == "detailed":
-            detailed_every = moe_metrics_cfg.get("detailed_every_steps", None) if moe_metrics_cfg else None
-            if detailed_every is None or step % detailed_every == 0:
-                wandb_log_fn(compute_detailed_metrics(self._moe_layer_loads, top_k=top_k), step=step)
-            else:
-                wandb_log_fn(compute_brief_metrics(self._moe_layer_loads, top_k=top_k), step=step)
-        else:
-            wandb_log_fn(compute_brief_metrics(self._moe_layer_loads, top_k=top_k), step=step)
-
-    def _setup_qat(self, cfg, model_parts: list[nn.Module]):
-        if not cfg.get("qat.enabled", False):
-            return None, None, None
-        from nemo_automodel.components.quantization.qat import (
-            get_disable_fake_quant_fn,
-            get_enable_fake_quant_fn,
-        )
-
-        qat_cfg = cfg.qat
-        _qat_enable_after = qat_cfg.get("fake_quant_after_n_steps", 0)
-        # Collect mode from any model part that has it
-        qat_mode = getattr(model_parts[0], "_qat_mode", None)
-
-        if qat_mode is None:
-            return None, None, None
-
-        _qat_disable_fn = get_disable_fake_quant_fn(qat_mode)
-        _qat_enable_fn = get_enable_fake_quant_fn(qat_mode)
-        if _qat_disable_fn is not None and _qat_enable_after is not None:
-            try:
-                # start with fake-quant disabled, will enable later
-                for part in model_parts:
-                    _qat_disable_fn(part)
-                logger.info("QAT fake-quant disabled initially; will enable after %s steps", _qat_enable_after)
-            except Exception as e:
-                logger.warning("Failed to disable fake-quant at setup: %s", e)
-        return _qat_disable_fn, _qat_enable_fn, _qat_enable_after
-
-    def _enable_qat_if_delayed(self, step: int):
-        if getattr(self, "_qat_enable_after", None) is None:
-            return
-        if step < self._qat_enable_after or self._qat_enable_fn is None:
-            return
-        try:
-            for mp in self.model_parts:
-                self._qat_enable_fn(mp)
-            logger.info("Enabled QAT fake-quant after step %s", step)
-            # Enable one
-            self._qat_enable_after = None
-        except Exception as e:
-            logger.warning("Failed to enable fake-quant: %s", e)
-
-    # ------------------ main loop ------------------
-    def run_train_validation_loop(self):
-        """Run the training loop over all epochs and batches.
-
-        For each batch, perform a forward pass, compute loss, backpropagate,
-        and update model parameters when necessary. Also prints loss every gradient step.
-        """
-        for mp in self.model_parts:
-            mp.train()
-        self.timestamp = time.perf_counter()
-
-        for epoch in self.step_scheduler.epochs:
-            self.step_scheduler.set_epoch(epoch)
-            # The step scheduler yields a list of batches with the following properties:
-            # 1. len(batches) == grad_acc_steps
-            # 2. len(batches[0]) == batch_size
-            for batches in self.step_scheduler:
-                # If QAT delayed fake-quant is configured, enable after threshold
-                self._enable_qat_if_delayed(self.step_scheduler.step)
-                train_log_data = self._run_train_optim_step(batches, self.max_grad_norm)
-                # Collect MoE load balance metrics (all ranks participate in all-reduce)
-                self._collect_moe_load_balance()
-                # log
-                self.log_train_metrics(train_log_data)
-
-                # Run validation every val_every_steps
-                val_losses = {}
-                if self.step_scheduler.is_val_step:
-                    for val_name, val_dataloader in self.val_dataloaders.items():
-                        val_log_data = self._run_validation_epoch(val_dataloader)
-                        val_losses[val_name] = val_log_data.metrics["val_loss"]
-                        self.log_val_metrics(val_name, val_log_data, self.metric_logger_valid[val_name])
-                    for mp in self.model_parts:
-                        mp.train()
-
-                # Save the checkpoint every ckpt_every_steps
-                if self.step_scheduler.is_ckpt_step:
-                    self.save_checkpoint(
-                        epoch,
-                        self.step_scheduler.step,
-                        train_log_data.metrics["loss"],
-                        val_losses,
-                        best_metric_key=self.best_metric_key,
-                    )
-                self._maybe_collect_garbage()
-        # Close JSONL loggers after training loop completes
-        self.metric_logger_train.close()
-        for v in self.metric_logger_valid.values():
-            v.close()
-
-        self.checkpointer.close()
-
-    # ------------------ helpers ------------------
-    def _forward_backward_step(
-        self,
-        idx,
-        batch,
-        *,
-        loss_buffer,
-        num_label_tokens,
-        num_batches,
-        is_train: bool = True,
-    ):
-        """Run the forward pass (and, when training, the backward pass) for one batch.
-
-        The batch is moved to ``self.dist_env.device``, wrapped by
-        ``make_cp_batch_and_ctx`` and its ``"labels"`` entry is popped.  The
-        detached local loss is appended to ``loss_buffer``; nothing is returned.
-
-        With pipeline parallelism the schedule's ``step`` (training) or ``eval``
-        (validation) is invoked; the local loss is the sum of the schedule's
-        losses on the last stage and ``0.0`` elsewhere.  Without pipeline
-        parallelism the model is called directly, the loss comes from
-        ``calculate_loss`` and, when training, ``backward()`` is called on the
-        loss multiplied by the DP group size (CP included).
-
-        Args:
-            idx: Index of ``batch`` within the current step; ``idx == num_batches - 1``
-                is passed to ``get_sync_ctx`` (non-PP training path only).
-            batch: Mapping of model inputs; must contain ``"labels"``.
-            loss_buffer: List that receives the detached local loss.
-            num_label_tokens: Forwarded to ``calculate_loss`` (non-PP path only).
-            num_batches: Number of batches in the current step.
-            is_train: When ``False`` no backward pass is run.
-
-        Raises:
-            ValueError: If ``FusedLinearCrossEntropy`` is used and the model
-                output has no ``"hidden_states"``.
-        """
-        # Move batch to device (handle both tensors and dicts of tensors like causal_mask_mapping)
-        # Note: ``None`` values inside nested dicts are dropped; non-tensor values pass through as-is.
-        batch = {
-            k: (
-                {dk: dv.to(self.dist_env.device, non_blocking=True) for dk, dv in v.items() if dv is not None}
-                if isinstance(v, dict)
-                else (v.to(self.dist_env.device, non_blocking=True) if isinstance(v, torch.Tensor) else v)
-            )
-            for k, v in batch.items()
-        }
-        # ``use_te`` is only true when both TE dot-product attention and the THD collater are in use.
-        train_ctx, batch = make_cp_batch_and_ctx(
-            self.device_mesh,
-            batch,
-            use_te=_uses_te_dot_product_attention(
-                self.model_parts[0] if hasattr(self, "model_parts") else self.cfg.model
-            )
-            and _uses_thd_collater(self.cfg.dataloader),
-            padding_token_id=self.tokenizer.pad_token_id if self.tokenizer else 0,
-            num_chunks=_get_num_thd_chunks(self.pp_enabled, self.cfg),
-        )
-        labels = batch.pop("labels")
-        # TE FP8 autocast is applied only when a TE FP8 config was found on the model.
-        fp8_ctx = self.te_fp8.maybe_te_autocast() if self.te_fp8 is not None else nullcontext()
-
-        if self.pp_enabled:
-            with train_ctx(), fp8_ctx:
-                # Only the last pipeline stage collects losses and needs targets.
-                losses = [] if self.pp.info.has_last_stage else None
-                if self.pp.info.has_last_stage:
-                    masked_labels = labels.clone()
-                    targets = masked_labels
-                else:
-                    targets = None
-
-                input_ids = batch.pop("input_ids")
-
-                # Update PP stage shapes for the current batch's seq_len.
-                # This is a no-op when the length hasn't changed.
-                self.pp.update_seq_len(input_ids.shape[1])
-
-                # Filter out None values and empty dicts from batch to avoid PP chunking errors
-                batch_filtered = {
-                    k: v for k, v in batch.items() if v is not None and not (isinstance(v, dict) and len(v) == 0)
-                }
-
-                # ``input_ids`` is passed positionally only on the first pipeline stage.
-                if is_train:
-                    # Use step for training (forward + backward)
-                    if self.pp.info.has_first_stage:
-                        self.pp.info.schedule.step(input_ids, target=targets, losses=losses, **batch_filtered)
-                    else:
-                        self.pp.info.schedule.step(target=targets, losses=losses, **batch_filtered)
-                else:
-                    # Use eval for validation (forward only, no backward)
-                    if self.pp.info.has_first_stage:
-                        self.pp.info.schedule.eval(input_ids, target=targets, losses=losses, **batch_filtered)
-                    else:
-                        self.pp.info.schedule.eval(target=targets, losses=losses, **batch_filtered)
-
-            # Ranks without the last stage contribute a zero loss.
-            if self.pp.info.has_last_stage:
-                local_loss = torch.sum(torch.stack(losses))
-            else:
-                local_loss = torch.tensor(0.0, device=self.dist_env.device)
-
-            loss_buffer.append(local_loss.clone().detach())
-        else:
-            model = self.model_parts[0]
-            # The sync context is only built for training; validation uses a no-op context.
-            sync_ctx = (
-                get_sync_ctx(
-                    model,
-                    idx == num_batches - 1,
-                    defer_fsdp_grad_sync=getattr(self.distributed_config, "defer_fsdp_grad_sync", True),
-                )
-                if is_train
-                else nullcontext()
-            )
-            with train_ctx(), sync_ctx, fp8_ctx:
-                batch = filter_forward_kwargs(model, batch)
-                if isinstance(self.loss_fn, FusedLinearCrossEntropy):
-                    # pass logits_to_keep=1 to avoid the full logits matrix in memory
-                    out = model(logits_to_keep=1, **batch)
-                    if "hidden_states" not in out:
-                        raise ValueError(
-                            "FusedLinearCrossEntropy requires the model to output hidden states. Set `model.output_hidden_states=True` in the config."
-                        )
-                else:
-                    out = model(**batch)
-
-                # ``out`` itself is used as the logits when it has no ``logits`` attribute.
-                local_loss = calculate_loss(
-                    self.loss_fn,
-                    logits=getattr(out, "logits", out),
-                    labels=labels,
-                    model=model,
-                    hidden_states=get_final_hidden_states(out),
-                    num_label_tokens=num_label_tokens,
-                )
-                loss_buffer.append(local_loss.clone().detach())
-                if is_train:
-                    # NOTE(review): presumably the DP(+CP) group-size factor offsets gradient
-                    # averaging across that group (see the scaling notes in _run_train_optim_step) - confirm.
-                    (local_loss * self._get_dp_group_size(include_cp=True)).backward()
-
-    def _run_train_optim_step(self, batches, max_grad_norm: Optional[float] = None):
-        """Execute a single training step.
-
-        Runs forward/backward over every batch in ``batches``, scales and clips
-        the gradients, steps the optimizers and LR schedulers, and assembles
-        the metrics for this step.
-
-        Args:
-            batches: List of batches of training data.
-            max_grad_norm: Gradient clipping norm. Optional, if None will not clip gradients.
-
-        Returns:
-            MetricsSample for the current scheduler step/epoch with the keys
-            "loss", "grad_norm", "lr", "mem", "tps", "tps_per_gpu", "mfu",
-            "num_tokens_per_step" and "num_label_tokens".
-        """
-
-        # Count label positions that are not -100 and sum the count via ``_dp_allreduce``.
-        num_label_tokens = torch.tensor(
-            sum((batch["labels"] != -100).sum().item() for batch in batches), dtype=torch.long
-        )
-        num_label_tokens = self._dp_allreduce(num_label_tokens).item()
-
-        # MoE aux loss gradients are injected via MoEAuxLossAutoScaler, which
-        # multiplies them by main_loss_backward_scale during backward.  This
-        # counteracts the unwanted scaling that FSDP and PP post-hoc rescaling
-        # apply to *all* gradients (including aux loss):
-        #
-        #   Non-PP: FSDP allreduce divides grads by dp_group_size.
-        #           Scale = dp_group_size  →  net = 1.
-        #
-        #   PP:     FSDP divides by dp_group_size, then
-        #           scale_grads_and_clip_grad_norm divides by
-        #           (num_label_tokens / dp_group_size).  The dp_group_size
-        #           factors cancel, leaving net 1/num_label_tokens.
-        #           Scale = num_label_tokens  →  net = 1.
-        if self.pp_enabled:
-            MoEAuxLossAutoScaler.main_loss_backward_scale = torch.tensor(float(num_label_tokens))
-        else:
-            MoEAuxLossAutoScaler.main_loss_backward_scale = torch.tensor(
-                float(self._get_dp_group_size(include_cp=True))
-            )
-
-        loss_buffer = []
-
-        # number of tokens in the batch, excluding any tail padding.
-        num_tokens_in_batch = torch.tensor(
-            sum(batch["labels"].numel() - count_tail_padding(batch["labels"]) for batch in batches),
-            dtype=torch.long,
-        )
-        num_tokens_in_batch = self._dp_allreduce(num_tokens_in_batch).item()
-
-        num_batches = len(batches)
-        prepare_for_grad_accumulation(self.model_parts, pp_enabled=self.pp_enabled)
-
-        for i, batch in enumerate(batches):
-            # The final batch of the step gets its own preparation hook before its backward.
-            if i == num_batches - 1:
-                prepare_for_final_backward(self.model_parts, pp_enabled=self.pp_enabled)
-
-            self._forward_backward_step(
-                i, batch, loss_buffer=loss_buffer, num_label_tokens=num_label_tokens, num_batches=num_batches
-            )
-
-            if i == 0:
-                prepare_after_first_microbatch()
-
-        grad_norm = scale_grads_and_clip_grad_norm(
-            max_grad_norm,
-            self.model_parts,
-            norm_type=2.0,
-            pp_enabled=self.pp_enabled,
-            device_mesh=self.device_mesh,
-            moe_mesh=self.moe_mesh,
-            ep_axis_name="ep" if self.moe_mesh is not None and "ep" in self.moe_mesh.mesh_dim_names else None,
-            pp_axis_name="pp" if self.pp_enabled else None,
-            foreach=True,
-            num_label_tokens=num_label_tokens,
-            dp_group_size=self._get_dp_group_size(include_cp=True),
-        )
-
-        # Note(MegatronFSDP): Need to call these functions for MegatronFSDP if not using latest api
-        # self.model_parts[0].finish_grad_sync()
-
-        # ``maybe_wait_for_staging`` is called before any optimizer mutates parameters.
-        self.checkpointer.maybe_wait_for_staging()
-        for opt in self.optimizer:
-            opt.step()
-            opt.zero_grad()
-
-        # Only the first model part is probed for ``update_moe_gate_bias``; all parts are updated.
-        if hasattr(self.model_parts[0], "update_moe_gate_bias"):
-            for mp in self.model_parts:
-                mp.update_moe_gate_bias()
-
-        if self.lr_scheduler is not None:
-            for scheduler in self.lr_scheduler:
-                scheduler.step(1)
-
-        # Precompute FP8 scales
-        # (only without PP, with a device mesh whose "dp_shard" dimension is larger than 1)
-        fp8_config = self.cfg.get("fp8", None)
-        if (
-            fp8_config is not None
-            and fp8_config.get("enabled", False)
-            and fp8_config.get("precompute_float8_dynamic_scale_for_fsdp", False)
-            and not self.pp_enabled
-            and self.device_mesh is not None
-            and self.device_mesh["dp_shard"].size() > 1
-        ):
-            precompute_float8_dynamic_scale_for_fsdp(self.model_parts[0])
-
-        # Note(MegatronFSDP): Need to call these functions for MegatronFSDP if not using latest api
-        # self.model_parts[0].install_optimized_model_weights()
-        # self.model_parts[0].zero_grad_buffer()
-
-        # Throughput is measured against the timestamp left by the previous step
-        # (or by run_train_validation_loop before the first step).
-        t = time.perf_counter()
-        time_delta = t - self.timestamp
-        self.timestamp = t
-        tps = num_tokens_in_batch / time_delta
-
-        # MFU stays None unless every batch has ``input_ids`` and the calculator returns FLOPs for it.
-        mfu = None
-        mfu_calculator = getattr(self, "mfu_calculator", None)
-        if batches and mfu_calculator is not None:
-            step_flops = 0.0
-            flops_supported = True
-            for batch in batches:
-                input_ids = batch.get("input_ids")
-                if input_ids is None:
-                    flops_supported = False
-                    break
-                batch_flops = mfu_calculator.get_flops(input_ids)
-                if batch_flops is None:
-                    flops_supported = False
-                    break
-                step_flops += float(batch_flops)
-
-            if flops_supported:
-                step_flops = self._dp_allreduce(
-                    torch.tensor(step_flops, dtype=torch.float64, device=self.dist_env.device), include_cp=True
-                ).item()
-                mfu = calculate_mfu(step_flops / 1e12, self.dist_env.world_size, time_delta)
-
-        reporting_loss = torch.sum(torch.stack(loss_buffer))
-        reporting_loss = self._dp_allreduce(reporting_loss, include_cp=True)
-        if self.pp_enabled:
-            reporting_loss = reporting_loss / num_label_tokens
-            reporting_loss = reporting_loss.to(self.dist_env.device)
-            # The last rank of the flattened device mesh sends the loss to rank 0;
-            # the main rank receives it.
-            src_rank = self.device_mesh.mesh.reshape(-1)[-1].item()
-            if self.dist_env.rank == src_rank:
-                torch.distributed.send(reporting_loss, dst=0)
-            elif self.dist_env.is_main:
-                torch.distributed.recv(reporting_loss, src=src_rank)
-
-        reporting_loss = reporting_loss.cpu().item()
-        # fix reporting_loss, tps across ranks
-
-        return MetricsSample(
-            step=self.step_scheduler.step,
-            epoch=self.step_scheduler.epoch,
-            metrics={
-                "loss": reporting_loss,
-                "grad_norm": grad_norm,
-                "lr": self.optimizer[0].param_groups[0]["lr"],
-                "mem": torch.cuda.max_memory_allocated() / 1024**3,
-                "tps": tps,
-                "tps_per_gpu": tps / self._get_cp_group_size() / max(self._get_dp_group_size(), 1),
-                "mfu": mfu,
-                "num_tokens_per_step": num_tokens_in_batch,
-                "num_label_tokens": num_label_tokens,
-            },
-        )
-
-    @torch.no_grad()
-    def _run_validation_epoch(self, val_dataloader):
-        """Run one pass over a single validation dataloader.
-
-        All model parts are switched to eval mode and are NOT switched back
-        here.  The summed loss is divided by the total number of label tokens
-        (labels different from ``-100``) after reduction across ranks.
-
-        Args:
-            val_dataloader: DataLoader for the validation dataset.
-
-        Returns:
-            MetricsSample for the current scheduler step/epoch with the keys
-            "val_loss", "lr", "num_label_tokens" and "mem".
-        """
-        with ScopedRNG(seed=1, ranked=True):
-            for mp in self.model_parts:
-                mp.eval()
-
-            total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.dist_env.device)
-            total_num_label_tokens = 0
-
-            for batch in val_dataloader:
-                loss_buffer = []
-                num_label_tokens = (batch["labels"] != -100).sum().item()
-                self._forward_backward_step(
-                    0,
-                    batch,
-                    loss_buffer=loss_buffer,
-                    num_label_tokens=None,  # we will normalize outside.
-                    num_batches=1,
-                    is_train=False,
-                )
-
-                total_loss += torch.sum(torch.stack(loss_buffer)).item()
-                total_num_label_tokens += num_label_tokens
-
-        total_loss = self._dp_allreduce(total_loss, include_cp=True)
-        total_num_label_tokens = self._dp_allreduce(
-            torch.tensor(total_num_label_tokens, dtype=torch.long, device=self.dist_env.device)
-        ).item()
-        # The 1e-8 floor guards against division by zero when no label tokens were seen.
-        val_loss = total_loss / max(total_num_label_tokens, 1e-8)
-
-        # For PP, send val_loss and num_label_tokens from last stage to main rank
-        if self.pp_enabled:
-            val_loss = val_loss.to(self.dist_env.device)
-            # On non-last ranks total_num_label_tokens is 0; this tensor is just a recv buffer.
-            pp_num_tokens = torch.tensor(total_num_label_tokens, dtype=torch.long, device=self.dist_env.device)
-            # The sender is the last rank of the flattened device mesh; the receiver is the main rank.
-            src_rank = self.device_mesh.mesh.reshape(-1)[-1].item()
-            if self.dist_env.rank == src_rank:
-                torch.distributed.send(val_loss, dst=0)
-                torch.distributed.send(pp_num_tokens, dst=0)
-            elif self.dist_env.is_main:
-                torch.distributed.recv(val_loss, src=src_rank)
-                torch.distributed.recv(pp_num_tokens, src=src_rank)
-                total_num_label_tokens = pp_num_tokens.item()
-
-        val_loss = val_loss.item() if isinstance(val_loss, torch.Tensor) else val_loss
-
-        return MetricsSample(
-            step=self.step_scheduler.step,
-            epoch=self.step_scheduler.epoch,
-            metrics={
-                "val_loss": val_loss,
-                "lr": self.optimizer[0].param_groups[0]["lr"],
-                "num_label_tokens": total_num_label_tokens,
-                "mem": torch.cuda.max_memory_allocated() / 1024**3,
-            },
-        )
-
-    def log_val_metrics(self, val_name, log_data, metric_logger=None):
-        """Log metrics to wandb, MLflow and other loggers
-        Args:
-            log_data: MetricsSample object, containing:
-                step: int, the current step.
-                epoch: int, the current epoch.
-                metrics: Dict[str, float], containing:
-                    "val_loss": Validation loss.
-                    "lr": Learning rate.
-                    "num_label_tokens": Number of label tokens.
-                    "mem": Memory allocated.
-        """
-
-        if not self.dist_env.is_main or log_data is None:
-            return
-
-        if wandb.run is not None:
-            wandb.log(log_data.to_dict() | {"val_name": val_name}, step=log_data.step)
-
-        if self.mlflow_logger is not None:
-            self.mlflow_logger.log_metrics(log_data.to_dict(), step=log_data.step)
-
-        if self.comet_logger is not None:
-            self.comet_logger.log_metrics(log_data.to_dict() | {"val_name": val_name}, step=log_data.step)
-
-        # JSONL validation log
-        if not metric_logger is None:
-            metric_logger.log(log_data)
-
-        logging.info(
-            '[val] name "{}" | step {} | epoch {} | loss {:.4f} | lr {:.2e} | num_label_tokens {}'.format(
-                val_name,
-                log_data.step,
-                log_data.epoch,
-                log_data.metrics["val_loss"],
-                log_data.metrics["lr"],
-                log_data.metrics["num_label_tokens"],
-            )
-        )
-
-    def log_train_metrics(self, log_data):
-        """Log metrics to wandb and other loggers.
-
-        Args:
-            log_data: MetricsSample object, containing:
-                step: int, the current step.
-                epoch: int, the current epoch.
-                metrics: Dict[str, float], containing:
-                    "loss": Training loss.
-                    "grad_norm": Grad norm from the training step.
-                    "lr": Learning rate.
-                    "mem": Memory allocated.
-                    "tps": Tokens per second.
-                    "tps_per_gpu": Tokens per second per GPU.
-                    "num_label_tokens": Number of label tokens.
-        """
-        if not self.dist_env.is_main:
-            return
-
-        # Log to remote services (WandB, MLflow, Comet) according to step_scheduler frequency
-        if self.step_scheduler.is_remote_logging_step:
-            if wandb.run is not None:
-                wandb.log(log_data.to_dict(), step=self.step_scheduler.step)
-            if self.mlflow_logger is not None:
-                self.mlflow_logger.log_metrics(log_data.to_dict(), step=log_data.step)
-            if self.comet_logger is not None:
-                self.comet_logger.log_metrics(log_data.to_dict(), step=log_data.step)
-
-        # Log MoE load balance metrics (already collected/reduced on all ranks)
-        if self.step_scheduler.is_remote_logging_step:
-            if wandb.run is not None:
-                self._log_moe_metrics(self.step_scheduler.step, wandb.log)
-            if self.comet_logger is not None:
-                self._log_moe_metrics(
-                    self.step_scheduler.step, lambda m, step: self.comet_logger.log_metrics(m, step=step)
-                )
-
-        # JSONL training log (always log for detailed local records)
-        self.metric_logger_train.log(log_data)
-        logging.info(
-            "step {} | epoch {} | loss {:.4f} | grad_norm {:.4f} | lr {:.2e} | mem {:.2f} GiB | tps {:.2f}({:.2f}/gpu) | num_label_tokens {}".format(
-                log_data.step,
-                log_data.epoch,
-                log_data.metrics["loss"],
-                log_data.metrics["grad_norm"],
-                log_data.metrics["lr"],
-                log_data.metrics["mem"],
-                log_data.metrics["tps"],
-                log_data.metrics["tps_per_gpu"],
-                log_data.metrics["num_label_tokens"],
-            )
-        )
-        torch.cuda.reset_peak_memory_stats()
-
-
-# ---------------------------------------------------------------------------
-# Entry point
-# ---------------------------------------------------------------------------
-
-
-def main(config_path=None):
-    """Main entry point for the fine-tuning recipe.
-
-    Loads the configuration, sets up the trainer, and initiates the training loop.
-    """
-    if config_path is None:
-        config_path = pathlib.Path(__file__).parent.resolve() / "llama_3_2_1b_hellaswag.yaml"
-    cfg = parse_args_and_load_config(config_path)
-    trainer = TrainFinetuneRecipeForNextTokenPrediction(cfg)
-    trainer.setup()
-    trainer.run_train_validation_loop()
-
-
-if __name__ == "__main__":
-    main()
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/_peft/lora.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import math
-from dataclasses import dataclass, field
-from typing import Any, Literal, Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.distributed.tensor import DTensor
-from torch.distributed.tensor.placement_types import Shard as _Shard
-
-from nemo_automodel.components._peft.lora_experts import GroupedExpertsDeepEPLoRA, GroupedExpertsLoRA
-from nemo_automodel.components._peft.lora_kernel import (
-    lora_da_dx_update_wrapper,
-    lora_db_update_wrapper,
-    lora_forward_wrapper,
-)
-from nemo_automodel.components._peft.module_matcher import ModuleMatcher
-from nemo_automodel.components.moe.layers import GroupedExperts, GroupedExpertsDeepEP, GroupedExpertsTE
-from nemo_automodel.shared.import_utils import safe_import, safe_import_te
-from nemo_automodel.shared.utils import dtype_from_str
-
-HAS_BNB, bitsandbytes = safe_import("bitsandbytes")
-HAS_TE, transformer_engine = safe_import_te()
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class PeftConfig:
-    target_modules: list = field(default_factory=list)
-    exclude_modules: list = field(default_factory=list)
-    match_all_linear: bool = False
-    dim: int = 8
-    alpha: int = 32
-    # Note: we currently support DoRA for nn.Linear only.
-    use_dora: bool = False
-    dropout: float = 0.0
-    dropout_position: Literal["pre", "post"] = "post"
-    lora_A_init: str = "xavier"
-    lora_dtype: Optional[torch.dtype] = None
-    use_triton: bool = False
-    moe_rank_scaling: bool = False
-
-    def to_dict(self):
-        return self.__dict__.copy()
-
-    @classmethod
-    def from_dict(cls, d: dict[str, Any]):
-        return cls(
-            target_modules=d.get("target_modules", []),
-            exclude_modules=d.get("exclude_modules", []),
-            match_all_linear=d.get("match_all_linear", False),
-            dim=d.get("dim", 8),
-            alpha=d.get("alpha", 32),
-            use_dora=d.get("use_dora", False),
-            dropout=d.get("dropout", 0.0),
-            dropout_position=d.get("dropout_position", "post"),
-            lora_A_init=d.get("lora_A_init", "xavier"),
-            lora_dtype=d.get("lora_dtype", None),
-            use_triton=d.get("use_triton", False),
-            moe_rank_scaling=d.get("moe_rank_scaling", False),
-        )
-
-
-class LinearLoRA(nn.Linear):
-    """
-    Linear + LoRA, maintains ckpts structure (i.e. Linear's weight/bias remain at the same FQN).
-
-    The _init_wrapper and _forward methods provide the LoRA functionality. We want to be able to
-    use those inside LinearLoRA but also for monkey-patching modules, without repeating the
-    same code -> therefore those are decorated with @staticmethod.
-    """
-
-    def __init__(
-        self,
-        orig_linear,
-        dim=8,
-        alpha=32,
-        use_dora: bool = False,
-        dropout=0.0,
-        dropout_position="post",
-        lora_A_init_method="xavier",
-        lora_dtype=None,
-    ):
-        """
-        LinearLora constructor.
-
-        Args:
-            orig_linear (nn.Module): the linear module to augment.
-            dim (int): lora's dim in_features -> dim -> out_features.
-            alpha (int): lora's scaling alpha.
-            dropout (float): dropout prob (default: 0.0).
-            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
-            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
-            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
-            are quantized weights (e.g. 4bit) needs to be specified explicitly.
-        """
-        assert isinstance(orig_linear, nn.Linear)
-        super(LinearLoRA, self).__init__(
-            in_features=orig_linear.in_features,
-            out_features=orig_linear.out_features,
-            bias=orig_linear.bias is not None,
-            device=orig_linear.weight.device,
-            dtype=orig_linear.weight.dtype,
-        )
-        # copy weights
-        self.weight.data.copy_(orig_linear.weight.data)
-        if orig_linear.bias is not None:
-            self.bias.data.copy_(orig_linear.bias.data)
-        # initialize the adapte
-        LinearLoRA._init_adapter(
-            self,
-            dim=dim,
-            alpha=alpha,
-            use_dora=use_dora,
-            dropout=dropout,
-            dropout_position=dropout_position,
-            lora_A_init_method=lora_A_init_method,
-            lora_dtype=lora_dtype,
-        )
-
-    @torch.no_grad
-    def init_lora_weights(self, init_method: str):
-        """
-        Initialize the LoRA weights.
-
-        Args:
-            init_method (str): Method to initialize the LoRA weights.
-        """
-        if init_method == "xavier":
-            nn.init.xavier_normal_(self.lora_A.weight.data)
-        else:
-            nn.init.kaiming_uniform_(self.lora_A.weight.data, a=math.sqrt(5))
-        self.lora_B.weight.data.fill_(0)
-
-    @torch.no_grad
-    @staticmethod
-    def _init_adapter(
-        obj,
-        dim=8,
-        alpha=32,
-        use_dora: bool = False,
-        dropout=0.0,
-        dropout_position="post",
-        lora_A_init_method="xavier",
-        lora_dtype=None,
-    ):
-        """
-        Adds LoRA weights to obj. Obj is either a LinearLoRA or an nn.Module (when monkey-patching).
-
-        Args:
-            obj (LinearLoRA | nn.Module): input module to adapt.
-            dim (int): lora's dim in_features -> dim -> out_features.
-            alpha (int): lora's scaling alpha.
-            dropout (float): dropout prob (default: 0.0).
-            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
-            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
-            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
-            are quantized weights (e.g. 4bit) needs to be specified explicitly.
-        """
-        obj.dim = dim
-        obj.scale = alpha / dim
-        obj.use_dora = bool(use_dora)
-
-        # Freezer
-        device = obj.weight.device
-        obj.weight.requires_grad = False
-        if obj.bias is not None:
-            obj.bias.requires_grad = False
-
-        in_features = obj.in_features
-        out_features = obj.out_features
-        if isinstance(lora_dtype, str):
-            lora_dtype = dtype_from_str(lora_dtype)
-        assert lora_dtype is None or isinstance(lora_dtype, torch.dtype)
-        dtype = lora_dtype or obj.weight.dtype
-
-        if HAS_TE and isinstance(obj, transformer_engine.pytorch.Linear):
-            obj.lora_A = transformer_engine.pytorch.Linear(
-                in_features=in_features, out_features=dim, bias=False, device=device, params_dtype=dtype
-            )
-            obj.lora_B = transformer_engine.pytorch.Linear(
-                in_features=dim, out_features=out_features, bias=False, device=device, params_dtype=dtype
-            )
-        else:
-            obj.lora_A = nn.Linear(in_features, dim, bias=False, dtype=dtype, device=device)
-            obj.lora_B = nn.Linear(dim, out_features, bias=False, dtype=dtype, device=device)
-        LinearLoRA.init_lora_weights(obj, lora_A_init_method)
-        obj.dropout_p = dropout
-        assert dropout_position in ["pre", "post"], ("dropout position can only be pre/post", dropout_position)
-        obj.dropout_position = dropout_position
-
-        if obj.use_dora:
-            # initialize DoRA magnitude vector to ||W|| (row-wise L2 norm).
-            with torch.no_grad():
-                weight_norm = torch.linalg.norm(obj.weight.data, dim=1).to(dtype=dtype, device=device)
-            obj.lora_magnitude = nn.Parameter(weight_norm, requires_grad=True)
-
-    def _dora_weight_norm(self) -> torch.Tensor:
-        """
-        Compute the detached weight norm used by DoRA.
-        """
-        # ΔW = B @ A, shapes: [out, dim] @ [dim, in] => [out, in]
-        delta_w = (self.lora_B.weight @ self.lora_A.weight).detach().to(self.weight.dtype)
-        weight = self.weight.to(self.weight.dtype)
-        weight_norm = torch.linalg.norm(weight + self.scale * delta_w, dim=1).to(weight.dtype)
-        return weight_norm.detach()
-
-    def forward(self, x):
-        """
-        Forward pass through the original linear layer augmented with the LoRA pathway.
-
-        Applies LoRA either before or after the dropout, depending on the configuration.
-        The result of the original linear transformation is combined with the LoRA output.
-
-        Args:
-            x (Tensor): Input tensor of shape (batch_size, in_features).
-
-        Returns:
-            Tensor: Output tensor of shape (batch_size, out_features).
-        """
-        # pylint: disable=C0115,C0116
-        # If LinearLoRA is used to monkey-patch a nn.Linear module, we want to use nn.Linear's
-        # forward in the case where it uses quantized weights. We store a reference to nn.Linear's
-        # forward in `super_fwd` attribute. If the attribute does not exist we do the usual linear.
-        if (fwd := getattr(self, "super_fwd", None)) is not None:
-            assert fwd != self.forward
-            res = fwd(x)
-        else:
-            # TE Linear can expose an empty .bias tensor (numel()==0) when bias=False; treat as no bias.
-            bias = self.bias
-            if bias is not None and bias.numel() == 0:
-                bias = None
-            # bmm avoids aten.view which cannot flatten a sharded dimension.
-            # F.linear calls view([b,s,h]->[b*s,h]) which fails when dim 0/1 is sharded
-            # (sequence parallelism) or during AOT-autograd tracing with compile.
-            _x_needs_bmm = (
-                isinstance(x, DTensor)
-                and x.dim() == 3
-                and any(isinstance(p, _Shard) and p.dim < 2 for p in x.placements)
-            )
-            if torch.compiler.is_compiling() or _x_needs_bmm:
-                b = x.shape[0]
-                res = torch.bmm(x, self.weight.t().unsqueeze(0).expand(b, -1, -1))
-                if bias is not None:
-                    res = res + bias
-            else:
-                res = F.linear(x, self.weight, bias)
-
-        if not self.use_dora:
-            if self.dropout_position == "pre":
-                x = F.dropout(x, p=self.dropout_p, training=self.training)
-
-            # Apply scale before lora_B to keep lora_res as a Partial tensor.
-            # This allows both res and lora_res to remain Partial, so only one reduce-scatter is needed after addition.
-            # Multiplying after lora_B would convert Partial to Replicate, causing an extra reduce-scatter operation.
-            lora_res = self.lora_B(self.lora_A(x) * self.scale)
-            if self.dropout_position == "post":
-                lora_res = F.dropout(lora_res, p=self.dropout_p, training=self.training)
-            return res + lora_res
-
-        if getattr(self, "lora_magnitude", None) is None:
-            raise RuntimeError("use_dora=True but lora_magnitude was not initialized")
-
-        if self.dropout_position == "pre" and self.training and self.dropout_p > 0.0:
-            x_lora = F.dropout(x, p=self.dropout_p, training=True)
-            base_result = None
-        else:
-            x_lora = x
-            base_result = res
-
-        lora_result = self.lora_B(self.lora_A(x_lora))
-        if self.dropout_position == "post":
-            lora_result = F.dropout(lora_result, p=self.dropout_p, training=self.training)
-
-        # Compute DoRA scaling factor.
-        weight_norm = self._dora_weight_norm()
-        mag = self.lora_magnitude.to(x.dtype)
-        weight_norm = weight_norm.to(x.dtype)
-
-        # Broadcast magnitude scaling across batch/sequence dimensions.
-        mag_norm_scale = mag / weight_norm
-        if res.dim() == 3:
-            mag_norm_scale = mag_norm_scale.view(1, 1, -1)
-        else:
-            mag_norm_scale = mag_norm_scale.view(1, -1)
-
-        # HF PEFT subtracts bias from base_result before applying scaling terms.
-        if base_result is not None:
-            bias = self.bias
-            if bias is not None and bias.numel() > 0:
-                base_no_bias = base_result - bias
-            else:
-                base_no_bias = base_result
-        else:
-            # Recompute base linear output without bias on x_lora (see HF PEFT DoraLinearLayer.forward).
-            base_no_bias = F.linear(x_lora, self.weight, None)
-
-        dora_extra = (mag_norm_scale - 1) * base_no_bias + mag_norm_scale * lora_result * self.scale
-        return res + dora_extra
-
-
-class TritonLinearLoRA(LinearLoRA):
-    """
-    Subclass of LinearLoRA that uses triton kernels for forward and backward passes.
-
-    Args:
-        orig_linear (nn.Module): the linear module to augment.
-        dim (int): lora's dim in_features -> dim -> out_features.
-        alpha (int): lora's scaling alpha.
-        dropout (float): dropout prob (default: 0.0).
-        dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
-        lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
-        lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
-        are quantized weights (e.g. 4bit) needs to be specified explicitly.
-    """
-
-    def forward(self, x):
-        """
-        Forward function for LoRA with triton kernels.
-
-        Args:
-            x (torch.Tensor): the input tensor.
-
-        Returns:
-            torch.Tensor: the output tensor.
-        """
-        # If LinearLoRA is used to monkey-patch a nn.Linear module, we want to use nn.Linear's
-        # forward in the case where it uses quantized weights. We store a reference to nn.Linear's
-        # forward in `super_fwd` attribute. If the attribute does not exist we do the usual linear.
-        if (fwd := getattr(self, "super_fwd", None)) is not None:
-            assert fwd != self.forward
-            res = fwd(x)
-        else:
-            res = F.linear(x, self.weight, self.bias)
-
-        if self.dropout_position == "pre":
-            x = F.dropout(x, p=self.dropout_p, training=self.training)
-        lora_res = LoRATritonFunction.apply(x, self.lora_A.weight, self.lora_B.weight, self.scale, x.dtype)
-        if self.dropout_position == "post":
-            lora_res = F.dropout(lora_res, p=self.dropout_p, training=self.training)
-
-        return res + lora_res
-
-
-def patch_linear_module(
-    orig_linear,
-    dim=8,
-    alpha=32,
-    use_dora: bool = False,
-    dropout=0.0,
-    dropout_position="post",
-    lora_A_init_method="xavier",
-    lora_dtype=None,
-    use_triton=True,
-    layer_name=None,
-):
-    """
-    Monkey-patches a nn.Linear (orig_linear param) to be a LinearLoRA.
-
-    The orig_linear might not contain valid weights, for example, the given orig_linear was
-    initialized within a context-manager that uses a "meta" device. Therefore, we cannot copy
-    the weight/bias from the orig_linear to the LinearLoRA, since those have not been allocated,
-
-    To circumvent this scenario, LinearLoRA's additional functionality (_init_adapter, _forward)
-    is based on static functions, so that we can use them for patching or when allocating a
-    new LinearLoRA object.
-
-    Args:
-        orig_linear (nn.Linear): the module we add adapter to.
-        dim (int, optional): Lora dim. Defaults to 8.
-        alpha (int, optional): Lora alpha scale. Defaults to 32.
-        dropout (float, optional): dropout prob. Defaults to 0.0.
-        dropout_position (str, optional): location to apply dropout wrt lora.
-            Defaults to 'post' (choices: 'pre', 'post').
-        lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'.
-        lora_dtype (_type_, optional): Lora weights' dtype. By default will use orig_linear's dtype
-        but orig_linear might use non-trainable dtype (e.g., 4bit), in which case the user must
-        specify the dtype manually. Defaults to None.
-        use_triton (bool, optional): By default we use the triton kernel LoRA implementation.
-
-    Returns:
-        (nn.Module): the monkey-patched (nn.Linear + LoRA) nn.Module
-    """
-    linear_types = [nn.Linear]
-    if HAS_TE:
-        linear_types.append(transformer_engine.pytorch.Linear)
-        use_triton = False
-    if not isinstance(orig_linear, tuple(linear_types)):
-        raise NotImplementedError("Expected isinstance(orig_linear, nn.Linear)")
-    assert not hasattr(orig_linear, "super_fwd"), orig_linear.super_fwd
-
-    if use_dora:
-        if HAS_TE and isinstance(orig_linear, transformer_engine.pytorch.Linear):
-            raise ValueError("DoRA is not supported for transformer_engine.pytorch.Linear layers.")
-        if getattr(orig_linear, "quant_state", None) is not None:
-            raise ValueError("DoRA is not supported for quantized linear layers (e.g., BitsAndBytes).")
-        use_triton = False
-
-    linear_lora_cls = TritonLinearLoRA if use_triton else LinearLoRA
-    linear_lora_cls._init_adapter(
-        orig_linear,
-        dim=dim,
-        alpha=alpha,
-        use_dora=use_dora,
-        dropout=dropout,
-        dropout_position=dropout_position,
-        lora_A_init_method=lora_A_init_method,
-        lora_dtype=lora_dtype,
-    )
-    cls = orig_linear.__class__
-    new_cls = type("PatchedLinearLoRA", (linear_lora_cls, cls), {})
-
-    # If the model uses quantized weights, we want to use orig_linear's forward
-    if (
-        getattr(orig_linear, "quant_state", None) is not None
-        and orig_linear.quant_state.__class__ == bitsandbytes.functional.QuantState
-    ):
-        if HAS_TE:
-            assert not isinstance(orig_linear, transformer_engine.pytorch.Linear), (
-                "quant_state is not supported with transformer_engine.pytorch.Linear"
-            )
-        orig_linear.super_fwd = orig_linear.forward
-    elif HAS_TE and isinstance(orig_linear, transformer_engine.pytorch.Linear):
-        # Delegate base computation to TE's forward so TE kernels (including FP8)
-        # are used instead of falling back to F.linear().
-        orig_linear.super_fwd = orig_linear.forward
-
-    orig_linear.__class__ = new_cls
-    if layer_name is not None:
-        orig_linear._layer_name = layer_name
-    return orig_linear
-
-
-def patch_moe_module(
-    orig_module,
-    dim=8,
-    alpha=32,
-    lora_A_init_method="xavier",
-    lora_dtype=None,
-):
-    """
-    Patches a custom MoE module (GroupedExperts or GroupedExpertsDeepEP) with LoRA.
-
-    Args:
-        orig_module (nn.Module): The original MoE module to be patched.
-        dim (int, optional): LoRA rank (dimension). Defaults to 8.
-        alpha (int, optional): LoRA scaling factor. Defaults to 32.
-        lora_A_init_method (str, optional): Initialization method for LoRA A matrix. Defaults to "xavier".
-        lora_dtype (torch.dtype or str, optional): Data type for LoRA weights. Defaults to None.
-
-    Returns:
-        nn.Module: The LoRA-wrapped MoE module (GroupedExpertsLoRA or GroupedExpertsDeepEPLoRA).
-    """
-    if isinstance(orig_module, GroupedExpertsTE):
-        raise NotImplementedError("LoRA is not supported for Transformer Engine (TE) expert modules.")
-    elif isinstance(orig_module, GroupedExpertsDeepEP):
-        new_module = GroupedExpertsDeepEPLoRA(
-            orig_module,
-            lora_dim=dim,
-            alpha=alpha,
-            lora_A_init_method=lora_A_init_method,
-            lora_dtype=lora_dtype,
-        )
-    elif isinstance(orig_module, GroupedExperts):
-        new_module = GroupedExpertsLoRA(
-            orig_module,
-            lora_dim=dim,
-            alpha=alpha,
-            lora_A_init_method=lora_A_init_method,
-            lora_dtype=lora_dtype,
-        )
-    else:
-        raise NotImplementedError(f"Unsupported MoE module type: {type(orig_module)}")
-
-    return new_module
-
-
-# patch a model in-place
-def apply_lora_to_linear_modules(
-    model: nn.Module,
-    peft_config: PeftConfig,
-    quantization_config=None,
-    skip_freeze: bool = False,
-) -> int:
-    """
-    Replace selected nn.Linear layers with LinearLoRA layers (in-place).
-
-    Args:
-        model: The model to apply LoRA to.
-        peft_config: PEFT configuration for LoRA parameters.
-        quantization_config: Optional separate QLoRA quantization configuration.
-        skip_freeze: If True, skip the global parameter freeze (caller will handle it later).
-
-    Returns:
-        Number of modules that were modified with LoRA.
-
-    Note:
-        target_modules accepts wildcard fragments, e.g. ["q_proj", "k_proj", ".*fc.*"].
-    """
-    # Freeze base model parameters
-    if not skip_freeze:
-        for w in model.parameters():
-            w.requires_grad_(False)
-
-    is_causal_lm = False
-    try:
-        if (
-            hasattr(model, "config")
-            and model.config.architectures is not None
-            and len(model.config.architectures) > 0
-            and "CausalLM" in model.config.architectures[0]
-        ):
-            # for example, LlamaForCausalLM
-            is_causal_lm = True
-    except (AttributeError, TypeError):
-        is_causal_lm = False
-
-    matcher = ModuleMatcher(
-        peft_config.target_modules, peft_config.exclude_modules, peft_config.match_all_linear, is_causal_lm
-    )
-    num_modules_matched = 0
-    for name, module in list(model.named_modules()):
-        if isinstance(module, (GroupedExperts, GroupedExpertsDeepEP, GroupedExpertsTE)):
-            if matcher.match(module, name):
-                if peft_config.use_dora:
-                    raise NotImplementedError("DoRA is not supported for MoE expert modules in Automodel yet.")
-                num_modules_matched += 1
-                lora_dtype = peft_config.lora_dtype
-                if quantization_config is not None and lora_dtype is None:
-                    lora_dtype = quantization_config.bnb_4bit_compute_dtype or torch.bfloat16
-
-                # Compute effective LoRA rank for MoE modules
-                moe_dim = peft_config.dim
-                if peft_config.moe_rank_scaling:
-                    n_act = module.config.n_activated_experts
-                    moe_dim = peft_config.dim // n_act
-                    if moe_dim < 1:
-                        raise ValueError(
-                            f"moe_rank_scaling: dim={peft_config.dim} // n_activated_experts={n_act} "
-                            f"gives rank {moe_dim}. Increase dim to at least n_activated_experts."
-                        )
-                    if peft_config.dim % n_act != 0:
-                        logger.warning(
-                            "moe_rank_scaling: dim=%d is not evenly divisible by n_activated_experts=%d; "
-                            "using floor division rank=%d.",
-                            peft_config.dim,
-                            n_act,
-                            moe_dim,
-                        )
-
-                # Replace the module in the model
-                new_module = patch_moe_module(
-                    module,
-                    dim=moe_dim,
-                    alpha=peft_config.alpha,
-                    lora_A_init_method=peft_config.lora_A_init,
-                    lora_dtype=lora_dtype,
-                )
-
-                # Find parent and replace
-                if "." not in name:
-                    setattr(model, name, new_module)
-                else:
-                    parent_name, child_name = name.rsplit(".", 1)
-                    parent = model.get_submodule(parent_name)
-                    setattr(parent, child_name, new_module)
-        else:
-            # Standard Linear patching
-            linear_types = [nn.Linear] + ([transformer_engine.pytorch.Linear] if HAS_TE else [])
-            if isinstance(module, tuple(linear_types)) and matcher.match(module, name):
-                num_modules_matched += 1
-                # For QLora, set lora_dtype to float16/bfloat16 since base weights are quantized
-                lora_dtype = peft_config.lora_dtype
-                if quantization_config is not None and lora_dtype is None:
-                    lora_dtype = quantization_config.bnb_4bit_compute_dtype or torch.bfloat16
-
-                patch_linear_module(
-                    module,
-                    dim=peft_config.dim,
-                    alpha=peft_config.alpha,
-                    use_dora=peft_config.use_dora,
-                    dropout=peft_config.dropout,
-                    dropout_position=peft_config.dropout_position,
-                    lora_A_init_method=peft_config.lora_A_init,
-                    lora_dtype=lora_dtype,
-                    use_triton=peft_config.use_triton,
-                    layer_name=name,
-                )
-
-    return num_modules_matched
-
-
-class LoRATritonFunction(torch.autograd.Function):
-    """
-    Autograd function that calls the triton kernel wrappers for the LoRA forward and backward passes.
-    """
-
-    @staticmethod
-    def setup_context(ctx, inputs, output):
-        """
-        Stores context for LoRA backward pass.
-        """
-        x, lora_A, lora_B, scale, _ = inputs
-        ctx.save_for_backward(x, lora_A, lora_B)
-        ctx.scale = scale
-
-    @staticmethod
-    def forward(x, lora_A, lora_B, scale, dtype):
-        """
-        Forward method for LoRATriton.
-
-        Reshapes 3D tensors into 2D and then calls the triton kernel.
-        """
-        reshape = x.dim() == 3
-        if reshape:
-            bs, seq_len, d = x.shape
-            x = x.reshape(-1, d)
-
-        lora_res = lora_forward_wrapper(x, lora_A.t(), lora_B.t(), res=None, scale=scale, dtype=dtype)
-
-        if reshape:
-            return lora_res.view(bs, seq_len, -1)
-        else:
-            return lora_res
-
-    @staticmethod
-    def backward(ctx, d_y):
-        """
-        Backward method for LoRATriton.
-
-        Reshapes 3D tensors into 2D and then calls the kernels to update d_lora_a, d_lora_b, and dx.
-        """
-        x, lora_A, lora_B = ctx.saved_tensors
-        scale = ctx.scale
-        dtype = x.dtype
-
-        reshape = x.dim() == 3
-        if reshape:
-            bs, seq_len, d = x.shape
-            d_y = d_y.reshape(-1, d_y.shape[-1])
-            x = x.reshape(-1, d)
-
-        d_lora_A, d_x = lora_da_dx_update_wrapper(x.t(), d_y, lora_B, lora_A, scale, dtype=dtype)
-        d_lora_B = lora_db_update_wrapper(lora_A, x.t(), d_y, scale, dtype)
-
-        if reshape:
-            d_x = d_x.view(bs, seq_len, d)
-        return d_x, d_lora_A.t(), d_lora_B, None, None
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/_peft/module_matcher.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-from dataclasses import dataclass, field
-from typing import List
-
-import torch.nn as nn
-
-from nemo_automodel.shared.import_utils import safe_import_te
-
-HAS_TE, transformer_engine = safe_import_te()
-import logging
-
-logger = logging.getLogger(__name__)
-from functools import lru_cache
-
-
-def _is_linear_module(module):
-    return isinstance(module, nn.Linear) or (HAS_TE and isinstance(module, transformer_engine.pytorch.Linear))
-
-
-@lru_cache(maxsize=1000)
-def _compile_wildcard_pattern(pattern):
-    pattern = re.sub(r"(?<!\.)\*", r".*", pattern)  # replace [^\.]* with `.*` ie insert "." before "*"
-    pattern = re.sub(r"\.\*", "(.*)", pattern)  # replace .* -> (.*)
-    return re.compile("^" + pattern + "$")
-
-
-def wildcard_match(pattern, key):
-    """
-    Return whether the pattern (target module to add LoRA) matches the key (model weight name).
-
-    Example:
-    --------
-        >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.0.self_attention.linear_qkv")
-        True
-        >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.1.self_attention.linear_qkv")
-        False
-    """
-    if key is None:
-        return False
-    regex_pattern = _compile_wildcard_pattern(pattern)
-    match = regex_pattern.match(key)
-    return match is not None
-
-
-@dataclass
-class ModuleMatcher:
-    """
-    Matches Modules to apply PEFT adapters on.
-
-    Args:
-        target_modules (List[str], optional): A list of module names to apply LoRA to.
-            Defaults to an empty list.
-            If empty and no other parameter is provided it will match to "*_proj".
-            Target modules can also contain wildcards (e.g. "*.layers.0.*.linear_qkv"). For example, you can specify
-                target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv
-                on the first two layers.
-        exclude_modules (List[str], optional): A list of module names to exclude from applying LoRA to.
-            Defaults to an empty list.
-            Exclude modules can also contain wildcards (e.g. "*.lm_head"). For example, you can specify
-                exclude_modules=['*.lm_head'] to exclude the lm_head.
-        match_all_linear (bool, optional): Whether to match all linear layers.
-            Defaults to False. Prefer using target_modules or exclude_modules to specify the modules to match,
-            to avoid issues with downstream tools (e.g., vLLM, etc).
-        is_causal_lm (bool, optional): Whether the model is a causal language model.
-    """
-
-    target_modules: List[str] = field(default_factory=list)
-    exclude_modules: List[str] = field(default_factory=list)
-    match_all_linear: bool = field(default=False)
-    is_causal_lm: bool = field(default=False)
-
-    def __post_init__(self):
-        """
-        Input validation.
-        """
-        if self.target_modules is None:
-            self.target_modules = []
-        if self.exclude_modules is None:
-            self.exclude_modules = []
-        if isinstance(self.target_modules, str):
-            self.target_modules = [self.target_modules]
-        if isinstance(self.exclude_modules, str):
-            self.exclude_modules = [self.exclude_modules]
-        if self.match_all_linear is False and len(self.target_modules) == 0 and len(self.exclude_modules) == 0:
-            logger.warning(
-                "No modules specified for LoRA. Will use target_modules='*_proj' by default."
-                """
-            Equivalent to the following YAML configuration:
-            peft:
-              target_modules: '*_proj'
-            If this is not what you want, please specify target_modules or exclude_modules.
-            """
-            )
-            self.target_modules = ["*_proj"]
-
-        if self.target_modules and self.exclude_modules:
-            raise ValueError(
-                "target_modules and exclude_modules are mutually exclusive. Please provide only one of them."
-            )
-        if self.match_all_linear and (len(self.target_modules) > 0 or len(self.exclude_modules) > 0):
-            raise ValueError(
-                "Expected target_modules/exclude_modules to be empty when match_all_linear is true. Please provide only one of them."
-            )
-        if self.match_all_linear:
-            logger.warning(
-                "match_all_linear is true. This will match all linear layers in the model (including lm_head). "
-                "Please consider using target_modules or exclude_modules to specify the modules to match, to avoid issues with downstream tools "
-                "For example, to match all linear layers except the lm_head, you can use: "
-                "peft: "
-                "  target_modules: '*_proj' "
-            )
-
-    # --------------------------------------------------------------------- #
-    # Public API                                                            #
-    # --------------------------------------------------------------------- #
-    def match(self, m: nn.Module, name: str = None, prefix: str = None):
-        """
-        Return (pattern, full_name) if the module matches; otherwise None.
-        """
-        full_name = f"{prefix}.{name}" if prefix else name
-
-        # 1. matching by layer type takes absolute precedence
-        if self.match_all_linear and _is_linear_module(m):
-            return True
-
-        # 2. target_modules is the next most-specific rule set
-        elif self.target_modules:
-            assert not self.exclude_modules, "`exclude_modules` must be empty when `target_modules` is used."
-            for pattern in self.target_modules:
-                if name == pattern or wildcard_match(pattern, full_name):
-                    return True
-            return False
-        # 3. Fallback: “all linear layers except those explicitly excluded”
-        else:
-            return (
-                name not in self.exclude_modules
-                and not any(wildcard_match(pattern, full_name) for pattern in self.exclude_modules)
-                and _is_linear_module(m)
-            )
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/utils.py
-```py
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Optional
-
-import torch
-from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
-
-
-def batchify(tensor, default_tensor_cls=torch.LongTensor):
-    """
-    Ensures that the input tensor has at least two dimensions by adding an extra batch dimension if necessary.
-
-    Args:
-        tensor (torch.Tensor): The input tensor to be batchified.
-
-    Returns:
-        torch.Tensor:  The tensor with an extra dimension added if it was originally 1-dimensional.
-        Otherwise, the tensor is returned as-is.
-    """
-    if not isinstance(tensor, torch.Tensor):
-        tensor = default_tensor_cls(tensor)
-    if tensor.ndim == 1:
-        return tensor.unsqueeze_(0)
-    return tensor
-
-
-def extract_key_from_dicts(batch, key):
-    """
-    Extracts the value of the given key from each dictionary in a list of dictionaries.
-
-    Args:
-        batch (List[dict]): A list of dictionaries.
-        key (str): The key whose values are to be extracted from each dictionary.
-
-    Returns:
-        List: A list of values associated with the specified key, in the same order as
-        the dictionaries in the input batch.
-    """
-    return list(map(lambda x: x[key], batch))
-
-
-def pad_within_micro(batch, pad_token_id, pad_seq_len_divisible=None):
-    """
-    Pads each list in a batch of lists to the same length with a specified token.
-
-    Args:
-        batch (List[List[int]]): A batch of sequences (e.g., token IDs), where each sequence
-            is a list of integers.
-        pad_token_id (int): The token ID to use for padding shorter sequences.
-        pad_seq_len_divisible (int): The value to use for padding sequence length so that it is
-            divisible by pad_seq_len_divisible.
-
-    Returns:
-        List[List[int]]: A batch of sequences where each inner list has been padded with the pad
-        token to match the length of the longest sequence in the batch.
-    """
-    max_len = max(map(len, batch))
-    if pad_seq_len_divisible:
-        max_len = math.ceil(max_len / pad_seq_len_divisible) * pad_seq_len_divisible
-    if pad_token_id is None:
-        # if it's none, extend the last token
-        pad_token_id = batch[0][-1]
-    return [item + [pad_token_id] * (max_len - len(item)) for item in batch]
-
-
-def find_last_non_pad_token(lst: list[int], value: int) -> int | None:
-    # lst = [optional-value .., non-value, ..., non-value, value, ...]
-    # return the index of the last non-value token
-    i = len(lst) - 1
-    found = False
-    while i >= 0:
-        if lst[i] == value:
-            i -= 1
-            found = True
-        else:
-            if found:
-                return i
-            else:
-                return None
-    return None
-
-
-def get_pad_token_from_key(val: str, pad_token_ids: Optional[dict[str, int]] = None) -> int | None:
-    PAD_TOKEN_IDS = {
-        "labels": -100,
-        "attention_mask": 0,
-        "loss_mask": 0,
-        "input_ids": 0,
-    }
-    if pad_token_ids is None:
-        pad_token_ids = {}
-    ans = pad_token_ids.get(val, PAD_TOKEN_IDS.get(val, None))
-    return ans
-
-
-def make_attention_mask_from_labels(ids: list[int], ignore_token: int = -100) -> list[int]:
-    # if the last token is not an ignore token, then the attention mask is all 1s
-    if len(ids) == 0:
-        return []
-    if ids[-1] != ignore_token:
-        ans = [1] * len(ids)
-    else:
-        # otherwise, find the last non-pad token and set the attention mask to 1s up to that point
-        last_non_pad_token_pos = find_last_non_pad_token(ids, ignore_token)
-        if last_non_pad_token_pos is None:
-            ans = [1] * len(ids)
-        else:
-            ans = [1] * (last_non_pad_token_pos + 1)
-        ans = ans + [0] * (len(ids) - len(ans))
-    assert len(ans) == len(ids)
-    return ans
-
-
-def create_causal_mask_mapping(
-    model_config,
-    batch_size,
-    seq_len,
-    position_ids=None,
-    attention_mask=None,
-    device=None,
-):
-    """
-    Create causal mask mapping for pipeline parallelism.
-
-    This is the core mask creation logic that can be reused by different collate functions.
-    Extracts common mask creation logic to avoid duplication between collate functions.
-
-    Args:
-        model_config: HuggingFace model config
-        batch_size: Batch size
-        seq_len: Sequence length
-        position_ids: Optional position IDs tensor [batch_size, seq_len]
-        attention_mask: Optional 2D attention mask tensor [batch_size, seq_len] for padding
-        device: Device to create tensors on (defaults to cpu)
-
-    Returns:
-        dict: Mapping of mask types to 4D mask tensors
-            - "full_attention": [batch_size, 1, seq_len, seq_len]
-            - "sliding_attention": [batch_size, 1, seq_len, seq_len] (if model uses sliding window)
-    """
-    if device is None:
-        device = torch.device("cpu")
-
-    # Create position_ids if not provided
-    if position_ids is None:
-        position_ids = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
-
-    # Prepare mask creation kwargs
-    mask_kwargs = {
-        "config": model_config,
-        "input_embeds": torch.empty((batch_size, seq_len), device=device),
-        "attention_mask": attention_mask,
-        "cache_position": position_ids[0],  # Use first row (all rows identical for non-padded data)
-        "past_key_values": None,  # Training only
-        "position_ids": position_ids,
-    }
-
-    # Create causal masks
-    causal_mask_mapping = {
-        "full_attention": create_causal_mask(**mask_kwargs),
-    }
-
-    # Add sliding window mask if model uses it
-    if hasattr(model_config, "sliding_window") and model_config.sliding_window is not None:
-        causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
-
-    return causal_mask_mapping
-
-
-def add_causal_masks_to_batch(batch_dict, model_config):
-    """
-    Add precomputed causal masks to an already-batched data dict.
-
-    This function is designed for datasets that yield complete batches (like MockIterableDataset),
-    where we want to add mask precomputation as a separate processing step.
-
-    Args:
-        batch: A dict or list containing a single batched dict with tensors:
-            - input_ids: [batch_size, seq_length]
-            - position_ids: [batch_size, seq_length] (optional)
-            - labels: [batch_size, seq_length]
-        model_config: HuggingFace model config for creating causal masks
-        precompute_masks: If False, skip mask creation (for compatibility with train_ft.py wrapper)
-
-    Returns:
-        dict: Same batch with added causal_mask_mapping field
-    """
-    # Extract info from batch
-    batch_size = batch_dict["input_ids"].shape[0]
-    seq_len = batch_dict["input_ids"].shape[1]
-    position_ids = batch_dict.get("position_ids")
-    attention_mask = batch_dict.get("attention_mask")  # May have padding info
-
-    # Create causal masks using the shared helper function
-    causal_mask_mapping = create_causal_mask_mapping(
-        model_config=model_config,
-        batch_size=batch_size,
-        seq_len=seq_len,
-        position_ids=position_ids,
-        attention_mask=attention_mask,
-        device=batch_dict["input_ids"].device,
-    )
-
-    batch_dict["causal_mask_mapping"] = causal_mask_mapping
-    return batch_dict
-
-
-def default_collater(batch, pad_seq_len_divisible=None):
-    """
-    Default batch collator that handles padding and batching.
-
-    Args:
-        batch: A batch of examples.
-        pad_seq_len_divisible: If provided, pad sequence length to be divisible by this value.
-
-    Returns:
-        dict: A dictionary containing batched tensors.
-    """
-    pad_token_ids = batch[0].pop("___PAD_TOKEN_IDS___", None)
-    # ans contains a dict with:
-    # key: str (e.g., "input_ids", "attention_mask", "labels", "loss_mask")
-    # value: list[list[int]] (e.g., [[1, 2, 3], [4, 5, 6]])
-    ans = {
-        key: pad_within_micro(
-            extract_key_from_dicts(batch, key),
-            get_pad_token_from_key(key, pad_token_ids),
-            pad_seq_len_divisible,
-        )
-        for key in batch[0].keys()
-    }
-
-    # convert to tensors
-    result = {k: batchify(torch.LongTensor(v)) for k, v in ans.items()}
-
-    # Add padding_mask similar to cp_utils.py
-    if "input_ids" in result:
-        input_ids_pad_token = get_pad_token_from_key("input_ids", pad_token_ids) or 0
-        result["padding_mask"] = (result["input_ids"] == input_ids_pad_token).bool()
-
-    return result
-
-
-def packed_sequence_thd_collater(batch):
-    """
-    Collater for packed sequences in THD (total, hidden, depth) format.
-
-    This collater is designed for THD format, where multiple variable-length
-    sequences are concatenated with/without padding tokens between them. The THD format represents
-    sequences as (total_tokens, hidden_dim, depth) where total_tokens is the sum of all sequence
-    lengths in the batch.
-
-    Unlike traditional padding-based approaches (BSHD/SBHD formats), this THD format:
-    - Concatenates sequences directly: [a a a b b c c c c]
-    - Uses seq_lens to identify sequence boundaries for attention computation
-    - Supports optional identifier or padding tokens between sequences via seq_lens_padded
-
-    This collater supports both pipeline parallelism (PP) and non-PP use cases by:
-    - Stacking token-level tensors (input_ids, labels, position_ids) along batch dimension
-    - Padding and stacking seq_lens and seq_lens_padded with sentinel value -1000
-    - Including 'qkv_format': 'thd' in the output to indicate THD format
-
-    When batch items lack packed-sequence metadata (seq_lens, seq_lens_padded, position_ids),
-    such as samples from ChatDataset, this collater synthesizes the missing fields so that each
-    sample is treated as a single-sequence "pack". Variable-length sequences are padded to the
-    longest length in the batch. This enables using THD format with TE context parallelism
-    without requiring the dataset to perform actual sequence packing.
-
-    Args:
-        batch (List[dict]): A list of dictionaries, where each dictionary represents one example.
-
-            For pre-packed data, each dictionary should contain:
-            - 'input_ids': List[int] - Token IDs for all packed sequences (must be same length across batch)
-            - 'labels': List[int] - Labels for all packed sequences (must be same length across batch)
-            - 'position_ids': List[int] - Position IDs for all tokens (must be same length across batch)
-            - 'seq_lens': List[int] - Actual sequence lengths for each packed sequence
-            - 'seq_lens_padded': List[int] - Sequence lengths including identifier/padding tokens
-
-            For non-packed data (e.g. ChatDataset), each dictionary needs only:
-            - 'input_ids': List[int] - Token IDs (variable length across batch)
-            - 'labels': List[int] - Labels (same length as input_ids)
-            - 'attention_mask': List[int] - (optional) 1 for real tokens, 0 for padding
-
-            Example batch with 2 packed examples, both with 6 total tokens:
-            [
-                {
-                    'input_ids': [1, 2, 3, 99, 4, 5],  # Two sequences: [1,2,3] and [4,5] with sep token 99
-                    'labels': [1, 2, 3, -100, 4, 5],
-                    'position_ids': [0, 1, 2, 0, 0, 1],
-                    'seq_lens': [3, 2],  # Actual sequence lengths (excluding separator)
-                    'seq_lens_padded': [4, 2]  # Including separator token
-                },
-                {
-                    'input_ids': [6, 7, 99, 8, 9, 10],  # Two sequences with separator
-                    'labels': [6, 7, -100, 8, 9, 10],
-                    'position_ids': [0, 1, 0, 0, 1, 2],
-                    'seq_lens': [2, 3],
-                    'seq_lens_padded': [3, 3]
-                }
-            ]
-
-    Returns:
-        dict: A dictionary with batched tensors:
-            - 'input_ids': tensor of shape [batch_size, seq_len] - stacked token sequences
-            - 'labels': tensor of shape [batch_size, seq_len] - stacked labels
-            - 'position_ids': tensor of shape [batch_size, seq_len] - stacked position IDs
-            - 'seq_lens': tensor of shape [batch_size, max_num_packs] - padded sequence lengths
-            - 'seq_lens_padded': tensor of shape [batch_size, max_num_packs] - padded lengths with separators
-            - 'qkv_format': str - Always 'thd' to indicate THD format
-
-        Note: seq_lens and seq_lens_padded are padded with -1000 to handle variable number of
-        packed sequences per example. These sentinel values should be filtered out before use.
-    """
-    # Extract and remove padding token metadata if present
-    pad_token_ids = None
-    if len(batch) > 0 and "___PAD_TOKEN_IDS___" in batch[0]:
-        pad_token_ids = batch[0].get("___PAD_TOKEN_IDS___")
-        for item in batch:
-            item.pop("___PAD_TOKEN_IDS___", None)
-
-    if len(batch) == 0:
-        return {}
-
-    # If batch items lack packed-sequence metadata (e.g. from ChatDataset),
-    # synthesize seq_lens, seq_lens_padded, and position_ids so that each
-    # sample is treated as a single-sequence "pack".
-    if "seq_lens" not in batch[0]:
-        input_ids_pad = get_pad_token_from_key("input_ids", pad_token_ids) or 0
-        max_len = max(len(item["input_ids"]) for item in batch)
-
-        for item in batch:
-            cur_len = len(item["input_ids"])
-            if "attention_mask" in item:
-                actual_len = sum(item["attention_mask"])
-                item.pop("attention_mask")
-            else:
-                actual_len = cur_len
-
-            pad_amount = max_len - cur_len
-            item["seq_lens"] = [actual_len]
-            # seq_lens_padded must cover the full padded length so that
-            # cu_seqlens_padded[-1] == total_tokens in the downstream THD pipeline.
-            item["seq_lens_padded"] = [max_len]
-            item["position_ids"] = list(range(max_len))
-
-            if pad_amount > 0:
-                item["input_ids"] = list(item["input_ids"]) + [input_ids_pad] * pad_amount
-                item["labels"] = list(item["labels"]) + [-100] * pad_amount
-
-    tokens = batchify(torch.stack([torch.tensor(x["input_ids"]) for x in batch]))
-    labels = batchify(torch.stack([torch.tensor(x["labels"]) for x in batch]))
-    position_ids = batchify(torch.stack([torch.tensor(x["position_ids"]) for x in batch]))
-
-    seq_lens = batchify(torch.LongTensor(pad_within_micro([x["seq_lens"] for x in batch], -1000)))
-    seq_lens_padded = batchify(torch.LongTensor(pad_within_micro([x["seq_lens_padded"] for x in batch], -1000)))
-
-    return {
-        "input_ids": tokens,
-        "labels": labels,
-        "position_ids": position_ids,
-        "seq_lens": seq_lens,
-        "seq_lens_padded": seq_lens_padded,
-        "qkv_format": "thd",
-    }
-
-
-def _indexed_mask_to_4d_block_causal(attention_mask: torch.Tensor) -> torch.Tensor:
-    """Convert an indexed attention mask to a 4D block-causal mask.
-
-    Args:
-        attention_mask: Integer tensor of shape ``[B, S]`` where each
-            position contains the 1-based index of the sub-sequence it
-            belongs to (0 = padding).
-
-    Returns:
-        Bool tensor of shape ``[B, 1, S, S]`` suitable for
-        ``eager`` / ``sdpa`` attention backends.  ``True`` means the
-        position is **allowed** to attend.
-    """
-    # attention_mask: [B, S]
-    B, S = attention_mask.shape
-
-    # same_doc[b, i, j] = True iff positions i and j belong to the same sub-sequence
-    mask_q = attention_mask.unsqueeze(2)  # [B, S, 1]
-    mask_k = attention_mask.unsqueeze(1)  # [B, 1, S]
-    same_doc = mask_q == mask_k  # [B, S, S]
-
-    # causal: position i can attend to position j only if j <= i
-    causal = torch.ones(S, S, dtype=torch.bool, device=attention_mask.device).tril()  # [S, S]
-
-    # not_padding: both positions must be non-padding (index > 0)
-    not_padding_q = (attention_mask > 0).unsqueeze(2)  # [B, S, 1]
-    not_padding_k = (attention_mask > 0).unsqueeze(1)  # [B, 1, S]
-
-    mask_4d = same_doc & causal.unsqueeze(0) & not_padding_q & not_padding_k  # [B, S, S]
-
-    return mask_4d.unsqueeze(1)  # [B, 1, S, S]
-
-
-def neat_packed_collater(batch: list[dict], attn_implementation: str = "sdpa") -> dict:
-    """Collater for neat-packed LLM sequences.
-
-    Stacks ``input_ids``, ``labels``, ``position_ids`` and converts the
-    indexed ``attention_mask`` to the format required by the attention backend.
-
-    For ``flash_attention_2``: keeps the indexed 2D mask ``[B, S]``.
-    For ``sdpa`` / ``eager``: converts to a 4D block-causal float mask.
-
-    Args:
-        batch: List of sample dicts produced by ``neat_pack_dataset``.
-        attn_implementation: Attention backend (``"flash_attention_2"``,
-            ``"sdpa"``, or ``"eager"``).
-
-    Returns:
-        Dict with batched tensors ready for model forward.
-    """
-    if not batch:
-        return {}
-
-    input_ids = batchify(torch.stack([torch.as_tensor(x["input_ids"]) for x in batch]))
-    labels = batchify(torch.stack([torch.as_tensor(x["labels"]) for x in batch]))
-    position_ids = batchify(torch.stack([torch.as_tensor(x["position_ids"]) for x in batch]))
-    attention_mask = batchify(torch.stack([torch.as_tensor(x["attention_mask"]) for x in batch]))
-
-    if attn_implementation == "flash_attention_2":
-        mask_out = attention_mask
-    else:
-        mask_out = _indexed_mask_to_4d_block_causal(attention_mask)
-
-    return {
-        "input_ids": input_ids,
-        "labels": labels,
-        "position_ids": position_ids,
-        "attention_mask": mask_out,
-    }
-
-
-class SFTSingleTurnPreprocessor:
-    """
-    Generic single-turn text-to-text SFT (supervised-fine-tuning) pre-processor.
-
-    Args:
-        tokenizer: Pre-trained tokenizer (HF).
-    """
-
-    def __init__(self, tokenizer):
-        """
-        SFTSingleTurnPreprocessor constructor.
-
-        Args:
-            tokenizer: Pretrained tokenizer.
-        """
-        self.tokenizer = tokenizer
-        self.block_size = None
-        self.preprocessing_num_workers = 1
-        self.overwrite_cache = False
-        self.pad_to_max_length = True
-
-    def _tokenize_function(self, examples, dataset):
-        ctx = dataset.get_context(examples)
-        tgt = dataset.get_target(examples)
-
-        ctx_tok = self.tokenizer(ctx)
-        tgt_tok = self.tokenizer(tgt)
-
-        # strip trailing special token from context
-        if len(ctx_tok["input_ids"][0]) > 0 and ctx_tok["input_ids"][0][-1] in self.tokenizer.all_special_ids:
-            ctx_tok["input_ids"] = [ids[:-1] for ids in ctx_tok["input_ids"]]
-            ctx_tok["attention_mask"] = [m[:-1] for m in ctx_tok["attention_mask"]]
-
-        # strip leading special token from target
-        if len(tgt_tok["input_ids"][0]) > 0 and tgt_tok["input_ids"][0][0] in self.tokenizer.all_special_ids:
-            tgt_tok["input_ids"] = [ids[1:] for ids in tgt_tok["input_ids"]]
-            tgt_tok["attention_mask"] = [m[1:] for m in tgt_tok["attention_mask"]]
-
-        out = {}
-        out["input_ids"] = [
-            c_ids + t_ids for c_ids, t_ids in zip(ctx_tok["input_ids"], tgt_tok["input_ids"], strict=False)
-        ]
-        out["attention_mask"] = [
-            c_m + t_m for c_m, t_m in zip(ctx_tok["attention_mask"], tgt_tok["attention_mask"], strict=False)
-        ]
-        # label: -100 for ctx, true ids for tgt
-        out["labels"] = [
-            [-100] * (len(c_ids) - 1) + t_ids + [-100]
-            for c_ids, t_ids in zip(ctx_tok["input_ids"], tgt_tok["input_ids"], strict=False)
-        ]
-
-        out["loss_mask"] = [[1 if t != -100 else 0 for t in lbl] for lbl in out["labels"]]
-        return out
-
-    def _compute_dataset_max_len(self, tokenized_ds):
-        max_len = max(map(lambda x: len(x["input_ids"]), tokenized_ds))
-        # make multiple of 8
-        max_len = math.ceil(max_len / 8) * 8
-        # respect model block size
-        if self.block_size is not None:
-            max_len = min(max_len, self.block_size)
-        return max_len
-
-    def _pad_function(self, max_len):
-        tk = self.tokenizer
-
-        def _pad(examples):
-            pad_id = tk.pad_token_id or 0
-            examples["input_ids"] = [
-                (ids[:max_len] + [pad_id] * max(0, max_len - len(ids))) for ids in examples["input_ids"]
-            ]
-            examples["attention_mask"] = [
-                ([1] * min(len(ids), max_len) + [0] * max(0, max_len - len(ids))) for ids in examples["attention_mask"]
-            ]
-            examples["labels"] = [(lbl[:max_len] + [-100] * max(0, max_len - len(lbl))) for lbl in examples["labels"]]
-            examples["loss_mask"] = [(lm[:max_len] + [0] * max(0, max_len - len(lm))) for lm in examples["loss_mask"]]
-            # return dictionary with sequences all exactly `max_len` long
-            return examples
-
-        return _pad
-
-    def process(self, raw_dataset, ds):
-        """
-        Main processor entry.
-
-        Args:
-            raw_dataset (datasets.DatasetDict): the dataset (e.g. returned by load_dataset)
-            ds (dataset): the dataset with get_target method.
-
-        Returns:
-            datasets.DatasetDict: tokenized + optionally padded datasets (all splits preserved).
-        """
-        if not hasattr(self.tokenizer, "pad_token") and hasattr(self.tokenizer, "bos_token"):
-            self.tokenizer.pad_token = self.tokenizer.bos_token
-
-        # 1. tokenise
-        tokenized = raw_dataset.map(
-            lambda x: self._tokenize_function(x, dataset=ds),
-            batched=True,
-            num_proc=self.preprocessing_num_workers,
-            remove_columns=raw_dataset.column_names,
-            load_from_cache_file=not self.overwrite_cache,
-            desc="Running tokenizer on dataset",
-        )
-
-        # 2. pad (optional)
-        if self.pad_to_max_length:
-            # 2a. compute global max len
-            max_len = self._compute_dataset_max_len(tokenized)
-
-            # 2b. pad to max len
-            pad_fn = self._pad_function(max_len)
-            tokenized = tokenized.map(
-                pad_fn,
-                batched=True,
-                num_proc=self.preprocessing_num_workers,
-                load_from_cache_file=not self.overwrite_cache,
-                desc=f"Padding dataset to max length {max_len}",
-            )
-
-        return tokenized
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/llm/chat_dataset.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import json
-import logging
-import re
-from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional, Sequence, Union
-
-from datasets import VerificationMode, load_dataset
-from torch.utils.data import Dataset
-
-from nemo_automodel.components.datasets.llm.formatting_utils import (
-    _add_pad_token,
-    _has_chat_template,
-    _resolve_chat_template,
-    format_chat_template,
-)
-
-
-def _is_hf_repo_id(val: str) -> bool:
-    # Basic check: org/name without local path existing
-    if "/" not in val:
-        return False
-    p = Path(val)
-    return not p.exists() and all(part for part in val.split("/"))
-
-
-def _as_iter(val: Union[str, Sequence[str]]) -> Iterator[str]:
-    if isinstance(val, str):
-        yield val
-    else:
-        for x in val:
-            if not isinstance(x, str):
-                raise ValueError("data_files entries must be strings")
-            yield x
-
-
-_SPLIT_SLICE_RE = re.compile(r"^(\w+)\[(\d*):(\d*)\]$")
-
-
-def _parse_split_slice(split: Optional[str]):
-    """Parse a split string like ``"train[1024:]"`` into ``(base_split, slice | None)``."""
-    if split is None:
-        return split, None
-    match = _SPLIT_SLICE_RE.match(split)
-    if not match:
-        return split, None
-    base = match.group(1)
-    start = int(match.group(2)) if match.group(2) else None
-    end = int(match.group(3)) if match.group(3) else None
-    return base, slice(start, end)
-
-
-def _load_openai_messages(
-    path_or_dataset_id: Union[str, Sequence[str]],
-    split: Optional[str] = None,
-    name: Optional[str] = None,
-    shuffle_seed: Optional[int] = None,
-    skip_invalid_samples: bool = False,
-):
-    """Load OpenAI chat messages datasets from HF or local JSON/JSONL files.
-
-    For HF repo IDs, we delegate to datasets.load_dataset.  When *split*
-    is provided, the full base split is loaded and shuffled *before* any
-    slice (e.g. ``[1024:]``) is applied so that train/val splits sample
-    from a consistent random order.  When *split* is ``None`` it is passed
-    through to ``load_dataset`` as-is (no default override).
-
-    For local files, we manually parse JSONL/JSON to avoid pyarrow type
-    inference issues (e.g., heterogeneous field types under `tools`).
-
-    Args:
-        path_or_dataset_id: HF dataset ID or local file path(s).
-        split: Dataset split to load (e.g., "train", "train[1024:]").
-        name: Dataset configuration/subset name
-        shuffle_seed: Random seed for shuffling HF datasets before slicing.
-            Set to ``None`` to disable shuffling.
-        skip_invalid_samples: If ``True``, skip malformed JSONL lines for local
-            files instead of failing fast.
-    """
-    if isinstance(path_or_dataset_id, str) and _is_hf_repo_id(path_or_dataset_id):
-        base_split, sl = _parse_split_slice(split)
-
-        dataset = load_dataset(
-            path_or_dataset_id,
-            name=name,
-            split=base_split,
-            streaming=False,
-            verification_mode=VerificationMode.NO_CHECKS,
-        )
-        if shuffle_seed is not None:
-            dataset = dataset.shuffle(seed=shuffle_seed)
-
-        if sl is not None:
-            indices = range(*sl.indices(len(dataset)))
-            dataset = dataset.select(indices)
-
-        return dataset
-
-    # Handle local directories and Parquet files via load_dataset.
-    # This covers pre-filtered cached datasets saved as Parquet.
-    if isinstance(path_or_dataset_id, str):
-        p = Path(path_or_dataset_id)
-        is_parquet_file = p.is_file() and p.suffix.lower() == ".parquet"
-        is_dataset_dir = p.is_dir() and any(p.glob("*.parquet"))
-
-        if is_parquet_file or is_dataset_dir:
-            logging.getLogger(__name__).info("Loading local dataset from %s via load_dataset", path_or_dataset_id)
-            base_split, sl = _parse_split_slice(split)
-
-            load_path = str(p.parent) if is_parquet_file else str(p)
-            # Cached Parquet datasets (from prefilter_dataset.py) are saved as a single
-            # split. Default to "train" when split is unspecified or was stripped to
-            # extract a slice (e.g. "train[:128]" → base_split="train", sl=slice(None,128)).
-            dataset = load_dataset(
-                load_path,
-                name=name,
-                split=base_split or "train",
-                data_files=p.name if is_parquet_file else None,
-                verification_mode=VerificationMode.NO_CHECKS,
-            )
-
-            if shuffle_seed is not None:
-                dataset = dataset.shuffle(seed=shuffle_seed)
-            if sl is not None:
-                indices = range(*sl.indices(len(dataset)))
-                dataset = dataset.select(indices)
-            return dataset
-
-    # Fall back to manual JSON/JSONL parsing for local files.
-    files = list(_as_iter(path_or_dataset_id))
-    if not files:
-        raise RuntimeError("No data files provided")
-
-    rows: List[Dict[str, Any]] = []
-
-    def _read_file(fp: str) -> None:
-        p = Path(fp)
-        if not p.exists():
-            raise FileNotFoundError(f"File not found: {fp}")
-        text = p.read_text(encoding="utf-8")
-        if p.suffix.lower() in {".jsonl", ".ndjson"}:
-            skipped_lines = 0
-            for line in text.splitlines():
-                line = line.strip()
-                if not line:
-                    continue
-                try:
-                    rows.append(json.loads(line))
-                except json.JSONDecodeError:
-                    if not skip_invalid_samples:
-                        raise
-                    skipped_lines += 1
-            if skipped_lines:
-                logging.getLogger(__name__).warning(
-                    "Skipped %d malformed JSONL line(s) from %s (skip_invalid_samples=True)",
-                    skipped_lines,
-                    fp,
-                )
-        else:
-            obj = json.loads(text)
-            if isinstance(obj, list):
-                rows.extend(obj)
-            else:
-                rows.append(obj)
-
-    for f in files:
-        _read_file(f)
-
-    return rows
-
-
-def _normalize_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Ensure messages list is valid and content fields are strings for system/user/assistant.
-
-    - Keeps tool_calling fields if present (e.g., tool calls in assistant messages, tool role messages).
-    - If content is a list of parts, only keep text parts.
-    """
-
-    def _normalize_content(value: Any) -> str:
-        if isinstance(value, list):
-            return " ".join(part["text"] for part in value if isinstance(part, dict) and "text" in part)
-        if value is None:
-            return ""
-        return str(value)
-
-    def _normalize_tool_calls(tool_calls: Any) -> List[Dict[str, Any]]:
-        if not isinstance(tool_calls, list):
-            raise ValueError("assistant message `tool_calls` must be a list")
-
-        normalized_tool_calls: List[Dict[str, Any]] = []
-        for idx, tool_call in enumerate(tool_calls):
-            if not isinstance(tool_call, dict):
-                raise ValueError(f"assistant message `tool_calls[{idx}]` must be a dict")
-
-            tool_call_id = tool_call.get("id")
-            if not isinstance(tool_call_id, str) or not tool_call_id:
-                raise ValueError(f"assistant message `tool_calls[{idx}].id` must be a non-empty string")
-
-            tool_call_type = tool_call.get("type")
-            if not isinstance(tool_call_type, str) or not tool_call_type:
-                raise ValueError(f"assistant message `tool_calls[{idx}].type` must be a non-empty string")
-
-            function = tool_call.get("function")
-            if not isinstance(function, dict):
-                raise ValueError(f"assistant message `tool_calls[{idx}].function` must be a dict")
-
-            function_name = function.get("name")
-            if not isinstance(function_name, str) or not function_name:
-                raise ValueError(f"assistant message `tool_calls[{idx}].function.name` must be a non-empty string")
-
-            function_arguments = function.get("arguments")
-            if function_arguments is None:
-                raise ValueError(f"assistant message `tool_calls[{idx}].function.arguments` is required")
-
-            normalized_function = dict(function)
-            if not isinstance(function_arguments, str):
-                normalized_function["arguments"] = json.dumps(function_arguments)
-
-            normalized_tool_call = dict(tool_call)
-            normalized_tool_call["function"] = normalized_function
-            normalized_tool_calls.append(normalized_tool_call)
-
-        return normalized_tool_calls
-
-    norm: List[Dict[str, Any]] = []
-    for m in messages:
-        role = m.get("role")
-        out = dict(m)
-        if role not in {"system", "user", "assistant", "tool"}:
-            raise ValueError(f"Unsupported role in messages: {role}")
-
-        out["content"] = _normalize_content(m.get("content"))
-
-        if role == "assistant":
-            if "reasoning_content" in m:
-                reasoning_content = m.get("reasoning_content")
-                if reasoning_content is None:
-                    out["reasoning_content"] = ""
-                else:
-                    if not isinstance(reasoning_content, str):
-                        raise ValueError("assistant message `reasoning_content` must be a string when provided")
-                    out["reasoning_content"] = reasoning_content
-            if "tool_calls" in m:
-                out["tool_calls"] = _normalize_tool_calls(m.get("tool_calls"))
-
-        if role == "tool":
-            tool_call_id = m.get("tool_call_id")
-            if not isinstance(tool_call_id, str) or not tool_call_id:
-                raise ValueError("tool message `tool_call_id` must be a non-empty string")
-
-        norm.append(out)
-    return norm
-
-
-class ChatDataset(Dataset):
-    """Dataset for OpenAI-format tool-calling chat transcripts.
-
-    This class expects each row to contain a `messages` list in OpenAI chat format,
-    potentially including tool calls and tool responses. The datasetformats the
-    conversation via the tokenizer's chat template to produce `input_ids`, `labels`,
-    and `attention_mask` suitable for SFT.
-    """
-
-    def __init__(
-        self,
-        path_or_dataset_id: Union[str, Sequence[str]],
-        tokenizer,
-        *,
-        split: Optional[str] = None,
-        name: Optional[str] = None,
-        seq_length: Optional[int] = None,
-        padding: Union[str, bool] = "do_not_pad",
-        truncation: Union[str, bool] = "do_not_truncate",
-        start_of_turn_token: Optional[str] = None,
-        chat_template: Optional[str] = None,
-        shuffle_seed: Optional[int] = None,
-        mask_reasoning_content: bool = False,
-        unshifted: bool = False,
-        skip_invalid_samples: bool = False,
-    ) -> None:
-        """Load OpenAI-format chat rows and tokenize via the chat template.
-
-        Args:
-            path_or_dataset_id: Hugging Face dataset id, local JSON/JSONL path(s), Parquet file, or Parquet directory.
-            tokenizer: Tokenizer with chat template support (required).
-            split: Dataset split or slice (e.g. ``train``, ``train[1024:]``).
-            name: Optional Hub subset / config name.
-            seq_length: Maximum sequence length for padding and truncation in formatting.
-            padding: Padding mode for ``format_chat_template``.
-            truncation: Truncation mode for ``format_chat_template``.
-            start_of_turn_token: Optional token marking assistant turns for answer-only loss.
-            chat_template: Optional Jinja template string overriding ``tokenizer.chat_template``.
-            shuffle_seed: If set, shuffles Hub/Parquet data before applying a split slice.
-            mask_reasoning_content: If ``True``, exclude rendered reasoning traces from the loss mask.
-            unshifted: Passed through to ``format_chat_template``.
-            skip_invalid_samples: If ``True``, skip malformed JSONL lines when reading local files (warning logs
-                include skip counts). If ``False``, a bad line raises. Does not skip invalid structured rows after
-                load; those still raise when a sample is accessed.
-        """
-        if tokenizer is None:
-            raise ValueError("Tokenizer is required")
-
-        # Enforce chat-template availability for tool-calling data
-        if chat_template is not None:
-            tokenizer.chat_template = _resolve_chat_template(chat_template)
-
-        if not _has_chat_template(tokenizer):
-            raise ValueError("ChatDataset requires a tokenizer with chat template support.")
-
-        self.tokenizer = tokenizer
-        self.seq_length = seq_length
-        self.padding = padding
-        self.truncation = truncation
-        self.start_of_turn_token = start_of_turn_token
-        self.mask_reasoning_content = mask_reasoning_content
-        self.unshifted = unshifted
-        self.skip_invalid_samples = skip_invalid_samples
-
-        self.dataset = _load_openai_messages(
-            path_or_dataset_id,
-            split=split,
-            name=name,
-            shuffle_seed=shuffle_seed,
-            skip_invalid_samples=skip_invalid_samples,
-        )
-
-        # Ensure pad token presence for downstream padding
-        eos_token_id = getattr(self.tokenizer, "eos_token_id", 0)
-        self.pad_token_id = _add_pad_token(self.tokenizer) or eos_token_id
-
-    def __len__(self) -> int:
-        return len(self.dataset)
-
-    def __getitem__(self, idx: int) -> Dict[str, List[int]]:
-        row = self.dataset[idx]
-        messages = row.get("messages")
-        if not isinstance(messages, list):
-            raise ValueError("Each sample must contain a `messages` list in OpenAI format")
-
-        normalized = _normalize_messages(messages)
-        tools = row.get("tools")
-        if tools is not None and not isinstance(tools, list):
-            tools = None
-
-        eos_token_id = getattr(self.tokenizer, "eos_token_id", 0)
-        sample = format_chat_template(
-            self.tokenizer,
-            normalized,
-            eos_token_id,
-            self.pad_token_id,
-            seq_length=self.seq_length,
-            padding=self.padding,
-            truncation=self.truncation,
-            tools=tools,
-            mask_reasoning_content=self.mask_reasoning_content,
-            unshifted=self.unshifted,
-        )
-        return sample
-
-```
-
-File: /Users/mromeijn/src/Automodel/nemo_automodel/components/datasets/llm/column_mapped_text_instruction_dataset.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import re
-from enum import Enum
-from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Union
-
-from datasets import VerificationMode, load_dataset
-from torch.utils.data import Dataset
-
-from nemo_automodel.components.datasets.llm.formatting_utils import (
-    _add_pad_token,
-    _has_chat_template,
-    format_chat_template,
-    format_prompt_completion,
-)
-
-logger = logging.getLogger(__name__)
-
-# Supported cases:
-# Format:
-# - Context + question + answer
-# - Question + answer
-# Input types:
-# - one or more paths to jsonl files
-# - dataset id from huggingface.
-
-
-class ColumnTypes(Enum):
-    Context = "context"
-    Question = "question"
-    Answer = "answer"
-
-
-def make_iterable(val: Union[str, List[str]]) -> Iterator[str]:
-    """Utility that converts *val* into an iterator of strings.
-
-    The helper accepts either a single string or a list of strings and
-    yields its contents. This is handy when we want to treat the two cases
-    uniformly downstream (e.g. when iterating over *data_files* that can be
-    provided as either a single path or a collection of paths).
-
-    Args:
-        val: Either a single string or a list/tuple of strings.
-
-    Yields:
-        str: The individual strings contained in *val*.
-
-    Raises:
-        ValueError: If *val* is neither a string nor an iterable of strings.
-    """
-    if isinstance(val, str):
-        yield val
-    elif isinstance(val, (list, tuple)):
-        for item in val:
-            if not isinstance(item, str):
-                raise ValueError("All elements must be strings")
-            yield item
-    else:
-        raise ValueError(f"Expected str or list[str], got {type(val)}")
-
-
-def _str_is_hf_repo_id(val: str) -> bool:
-    """
-    Check if a string is a valid huggingface dataset id.
-
-    Args:
-        val: A string to check.
-
-    Returns:
-        True if the string is a valid huggingface dataset id, False otherwise.
-    """
-    return re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", val) is not None and not Path(val).exists()
-
-
-def _load_dataset(
-    path_or_dataset_id: Union[str, List[str]],
-    split: Optional[str] = None,
-    streaming: bool = False,
-    name: Optional[str] = None,
-):
-    """Load a dataset either from the Hugging Face Hub or from local JSON/JSONL files.
-
-    If *path_or_dataset_id* resembles a HF repo ID (i.e. of the form
-    ``org/dataset`` and the path does **not** exist on the local filesystem),
-    we defer to ``datasets.load_dataset`` directly. Otherwise, we assume the
-    argument points to one or more local JSON/JSONL files and let
-    ``datasets.load_dataset`` with the *"json"* script handle the parsing.
-
-    Args:
-        path_or_dataset_id: Either a HF dataset identifier (``org/name``) or
-            a path / list of paths to local ``.json`` / ``.jsonl`` files.
-        split: Optional split to load when retrieving a remote dataset. This
-            parameter is ignored for local files as the *json* script always
-            returns a single split.
-        streaming: Whether to stream the dataset.
-        name: Optional name of the dataset configuration/subset to load
-
-    Returns:
-        datasets.Dataset: The loaded dataset.
-    """
-    if isinstance(path_or_dataset_id, str) and _str_is_hf_repo_id(path_or_dataset_id):
-        return load_dataset(
-            path_or_dataset_id,
-            name=name,
-            split=split,
-            streaming=streaming,
-            verification_mode=VerificationMode.NO_CHECKS,
-        )
-
-    data_files = list(make_iterable(path_or_dataset_id))
-    if not data_files:
-        raise RuntimeError("No data files provided")
-
-    return load_dataset(
-        "json", data_files=data_files, split="train", streaming=streaming, verification_mode=VerificationMode.NO_CHECKS
-    )
-
-
-def _check_all_values_equal_length(sample: Dict[str, List[int]]) -> bool:
-    """
-    Check if all values in the sample are of the same length.
-    """
-    len0 = len(sample[next(iter(sample))])
-    all_equal = True
-    for k, v in sample.items():
-        if k == "___PAD_TOKEN_IDS___":
-            continue
-        if len(v) != len0:
-            all_equal = False
-            break
-    return all_equal
-
-
-class ColumnMappedTextInstructionDataset(Dataset):
-    """Generic instruction-tuning dataset that maps arbitrary column names.
-
-    The class is intentionally lightweight: it simply loads the raw samples
-    (either from HF or from local JSON/JSONL files) and remaps the columns so
-    that downstream components can rely on a consistent field interface.
-
-    Optionally, if *answer_only_loss_mask* is requested, the dataset will also
-    compute a *loss_mask* indicating which tokens should contribute to the
-    loss (typically only those belonging to the assistant answer).
-    """
-
-    def __init__(
-        self,
-        path_or_dataset_id: Union[str, List[str]],
-        column_mapping: Dict[str, str],
-        tokenizer,
-        *,
-        split: Optional[str] = "train",
-        name: Optional[str] = None,
-        answer_only_loss_mask: bool = True,
-        seq_length: Optional[int] = None,
-        padding: Union[str, bool] = "do_not_pad",
-        truncation: Union[str, bool] = "do_not_truncate",
-        limit_dataset_samples: Optional[int] = None,
-        use_hf_chat_template: bool = False,
-    ) -> None:
-        """
-        Initialize the dataset.
-
-        Args:
-            path_or_dataset_id: The path or dataset id of the dataset.
-            column_mapping: The mapping of the columns.
-            tokenizer: The tokenizer to use.
-            split: The split of the dataset to load.
-            name: The name of the dataset configuration/subset to load
-            answer_only_loss_mask: Whether to compute the loss mask only on the answer tokens.
-            seq_length: The sequence length to use for padding.
-            limit_dataset_samples: The number of samples to load from the dataset.
-        """
-
-        if use_hf_chat_template and _has_chat_template(tokenizer):
-            if not answer_only_loss_mask:
-                logging.warning(
-                    "answer_only_loss_mask=False but tokenizer has chat template. Consider providing `answer_only_loss_mask`."
-                )
-
-        assert tokenizer is not None, "Tokenizer is required"
-        self.tokenizer = tokenizer
-        if getattr(self.tokenizer, "pad_token", None) is None:
-            if hasattr(self.tokenizer, "eos_token"):
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            else:
-                logger.warning("Setting tokenizer pad_token to ' '. tokenizer does not have `eos_token`.")
-                self.tokenizer.pad_token = " "
-
-        self.dataset = _load_dataset(path_or_dataset_id, split=split, streaming=False, name=name)
-
-        if limit_dataset_samples is not None:
-            self.dataset = self.dataset.select(range(limit_dataset_samples))
-
-        # Keep mapping: dest -> source (i.e. public_field -> raw_column_name)
-
-        assert isinstance(column_mapping, dict), "Expected column_mapping to be a dictionary"
-        # Ensure required columns are present
-        assert ColumnTypes.Answer.value in column_mapping, ("Expected answer to be in column_mapping", column_mapping)
-        if len(column_mapping) == 3:
-            assert ColumnTypes.Context.value in column_mapping, (
-                "Expected context to be in column_mapping",
-                column_mapping,
-            )
-            assert ColumnTypes.Question.value in column_mapping, (
-                "Expected question to be in column_mapping",
-                column_mapping,
-            )
-        elif len(column_mapping) == 2:
-            assert ColumnTypes.Context.value in column_mapping or ColumnTypes.Question.value in column_mapping, (
-                "Expected context or question to be in column_mapping",
-                column_mapping,
-            )
-        else:
-            raise ValueError(f"Expected 2 or 3 columns in column_mapping, got {len(column_mapping)}")
-
-        self.column_mapping = column_mapping
-
-        self.answer_only_loss_mask = answer_only_loss_mask
-        self.seq_length = seq_length
-        self.padding = padding
-        self.truncation = truncation
-        self.use_hf_chat_template = use_hf_chat_template
-
-    def __iter__(self) -> Iterator[Dict[str, List[int]]]:
-        for idx in range(len(self)):
-            yield self[idx]
-
-    def __len__(self) -> int:  # noqa: D401
-        """
-        Returns the length of the dataset.
-
-        Returns:
-            The length of the dataset.
-        """
-        return len(self.dataset)
-
-    def __getitem__(self, idx):  # noqa: D401
-        """
-        Returns the item at the given index.
-
-        Args:
-            idx: The index of the item to return.
-
-        Returns:
-            A dictionary with the mapped columns.
-        """
-        n = len(self.dataset)
-        for _ in range(n):
-            row = self.dataset[idx]
-            mapped = {dest: row[src] for dest, src in self.column_mapping.items() if src in row}
-            mapped = self._apply_tokenizer(mapped)
-            if any(label != -100 for label in mapped["labels"]):
-                assert _check_all_values_equal_length(mapped), "All values must be of the same length"
-                return mapped
-            idx = (idx + 1) % n
-        raise ValueError(
-            "All samples in the dataset produced labels that are entirely -100. "
-            "Check that the dataset and tokenizer configuration produce valid training targets."
-        )
-
-    def _apply_tokenizer(self, sample: Dict[str, str]) -> Dict[str, List[int]]:
-        """
-        Tokenize a mapped *sample* and compute auxiliary fields.
-
-        If the tokenizer is provided:
-        - If the tokenizer supports a chat template, the dataset will be tokenized in a conversation style.
-        - Otherwise, the dataset will be tokenized in a simple prompt-completion style.
-
-        Args:
-            sample: A dictionary with the mapped columns.
-
-        Returns:
-            A dictionary with the tokenized columns.
-        """
-        assert isinstance(sample, dict), "Expected sample to be a dictionary"
-        assert len(sample) >= 2, "Expected at least two columns"
-        context = sample.get(ColumnTypes.Context.value, None)
-        question = sample.get(ColumnTypes.Question.value, None)
-        answer = sample[ColumnTypes.Answer.value]
-
-        eos_token_id = getattr(self.tokenizer, "eos_token_id", 0)
-        pad_token_id = _add_pad_token(self.tokenizer) or eos_token_id
-
-        if self.use_hf_chat_template and _has_chat_template(self.tokenizer):
-            formatted_text = [
-                {"role": "system", "content": context or ""},
-                {"role": "user", "content": question or ""},
-                {"role": "assistant", "content": answer},
-            ]
-            return format_chat_template(
-                self.tokenizer,
-                formatted_text,
-                eos_token_id,
-                pad_token_id,
-                seq_length=self.seq_length,
-                padding=self.padding,
-                truncation=self.truncation,
-                answer_only_loss_mask=self.answer_only_loss_mask,
-            )
-        else:
-            prompt = " ".join(filter(lambda x: x is not None, (context, question, "")))
-            assert len(prompt) > 1, "Expected prompt to be non-empty"
-            return format_prompt_completion(
-                self.tokenizer,
-                prompt,
-                answer,
-                eos_token_id,
-                pad_token_id,
-                seq_length=self.seq_length,
-                padding=self.padding,
-                truncation=self.truncation,
-                answer_only_loss_mask=self.answer_only_loss_mask,
-            )
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/guides/llm/finetune.md
-```md
-# Supervised Fine-Tuning (SFT) and Parameter-Efficient Fine-Tuning (PEFT)
-
-## Introduction
-
-Pretrained language models are general-purpose: they know a lot about language but nothing about your particular domain, terminology, or task. Fine-tuning bridges that gap — you fine-tune the model on your own examples so it produces answers that are accurate and relevant for your use case, without the cost of training a model from scratch. The result is a model optimized for your data that you can evaluate, publish, and deploy. This guide walks you through that process end-to-end with NeMo AutoModel — from installation through training, evaluation, and deployment — using [Meta LLaMA 3.2 1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and the [SQuAD v1.1](https://huggingface.co/datasets/rajpurkar/squad) dataset as a running example.
-
-NeMo AutoModel supports two fine-tuning modes:
-
-- **Supervised Fine-Tuning (SFT)** updates all model parameters. Use SFT when you need maximum accuracy and have sufficient compute.
-- **Parameter-Efficient Fine-Tuning (PEFT)** using [LoRA](https://arxiv.org/abs/2106.09685) freezes the base model and trains small low-rank adapters. PEFT reduces trainable parameters to less than 1% of the original model, lowering memory and storage costs.
-
-### Workflow Overview
-
-```text
-┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐    ┌──────────────┐
-│ 1. Install   │--->│ 2. Configure │--->│  3. Train    │--->│ 4. Inference │--->│ 5. Evaluate  │--->│ 6. Publish   │--->│  7. Deploy   │
-│              │    │              │    │              │    │              │    │              │    │  (optional)  │    │  (optional)  │
-│ pip install  │    │ YAML config  │    │ automodel CLI│    │ HF generate  │    │ Val loss +   │    │ HF Hub       │    │ vLLM serving │
-│ or Docker    │    │ Choose SFT   │    │ or torchrun  │    │ API          │    │ lm-eval-     │    │ upload       │    │              │
-│              │    │ or PEFT      │    │              │    │              │    │ harness      │    │              │    │              │
-└──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘    └──────────────┘
-```
-
-| Step | Section | SFT | PEFT |
-|------|---------|-----|------|
-| **1. Install** | [Install NeMo AutoModel](#install-nemo-automodel) | Same | Same |
-| **2. Configure** | [Configure Your Training Recipe](#configure-your-training-recipe) | YAML without `peft:` section | YAML with `peft:` section |
-| **3. Train** | [Fine-Tune the Model](#fine-tune-the-model) | Same command for both modes | Same command for both modes |
-| **4. Inference** | [Run Inference](#run-inference) | Load consolidated checkpoint directly | Load base model + adapter |
-| **5. Evaluate** | [Evaluate the Fine-Tuned Model](#evaluate-the-fine-tuned-model) | Validation loss during training; lm-eval-harness post-training | Same |
-| **6. Publish** | [Publish to HF Hub](#publish-to-the-hugging-face-hub) | Upload `model/consolidated/` | Upload `model/` (adapter only) |
-| **7. Deploy** | [Deploy with vLLM](#deploy-with-vllm) | `vllm.LLM(model=...)` | `vLLMHFExporter` with `--lora-model` |
-
-## Install NeMo AutoModel
-
-```bash
-pip3 install nemo-automodel
-```
-
-Alternatively, if you run into dependency or driver issues, use the pre-built Docker container:
-
-```bash
-docker pull nvcr.io/nvidia/nemo-automodel:26.02.00
-docker run --gpus all -it --rm --shm-size=8g -v $(pwd)/checkpoints:/tmp/checkpoints/ nvcr.io/nvidia/nemo-automodel:26.02.00
-```
-
-:::{important}
-Docker containers are ephemeral — files written inside the container are lost when it stops. The `-v` flag in the `docker run` command above bind-mounts a local `checkpoints/` directory into the container so that saved checkpoints persist across runs. For more details, see [Saving Checkpoints When Using Docker](../checkpointing.md#saving-checkpoints-when-using-docker).
-:::
-
-For the full set of installation methods, see the [installation guide](../installation.md).
-
-## Configure Your Training Recipe
-
-
-Training is configured through a [YAML](https://en.wikipedia.org/wiki/YAML) config file with three required sections — **model**, **dataset**, and **step_scheduler** — plus an optional **peft** section. The sections below walk through each one. For the complete copy-pastable file, see [Full Config YAML](#full-config-yaml).
-
-Under the hood, both SFT and PEFT are executed by a **recipe**: a self-contained Python class that wires together model loading, dataset preparation, training, checkpointing, and logging. The fine-tuning recipe is [`TrainFinetuneRecipeForNextTokenPrediction`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). The config file tells the recipe *what* to build; the recipe decides *how* to build it.
-
-:::{dropdown} How the Config System Works
-
-NeMo AutoModel configs use a convention borrowed from [Hydra](https://hydra.cc/): the special **`_target_`** key tells the framework *which* Python class or function to call, and **every other key** in the same YAML block is passed as a keyword argument to that call. For example:
-
-```yaml
-optimizer:
-  _target_: torch.optim.Adam
-  lr: 1.0e-5
-  weight_decay: 0
-```
-
-is equivalent to writing this Python code:
-
-```python
-from torch.optim import Adam
-
-optimizer = Adam(lr=1.0e-5, weight_decay=0)
-```
-
-The `_target_` value is a **dotted Python import path**: the same string you would use in an `import` statement. The framework resolves it at runtime by importing the module and looking up the attribute. This means you can point `_target_` at any class constructor or factory function, and the remaining keys become its arguments.
-
-:::{tip}
-To discover which parameters a section accepts, look up the Python signature of its `_target_`. For instance, `torch.optim.Adam` accepts `lr`, `betas`, `eps`, and `weight_decay` — those are the keys you can set in the YAML.
-:::
-
-**From YAML to running code.** Here is the path a config takes through the framework:
-
-```text
-finetune_config.yaml
-        │
-        ▼
-  ┌──────────────┐     load_yaml_config() parses the file into
-  │  ConfigNode  │◄─── a tree of ConfigNode objects, one per
-  └──────┬───────┘     YAML section.
-         │
-         ▼
-  ┌──────────────┐     The recipe's setup() method reads
-  │   Recipe     │◄─── each section from the ConfigNode tree
-  │   setup()    │     and passes it to the matching builder.
-  └──────┬───────┘
-         │
-    ┌────┴─────────────────────────────────┐
-    ▼            ▼            ▼            ▼
-build_model  build_optimizer build_dataloader build_loss_fn ...
-    │            │            │            │
-    ▼            ▼            ▼            ▼
-cfg.model     cfg.optimizer cfg.dataset   cfg.loss_fn
- .instantiate() .instantiate() .instantiate() .instantiate()
-    │            │            │            │
-    ▼            ▼            ▼            ▼
- Resolves      Resolves     Resolves     Resolves
- _target_,     _target_,    _target_,    _target_,
- calls it      calls it     calls it     calls it
- with kwargs   with kwargs  with kwargs  with kwargs
-```
-
-Each builder function calls **`.instantiate()`** on its config section. `.instantiate()` does two things:
-
-1. **Resolves `_target_`** — imports the Python path and obtains the callable (class or function).
-2. **Calls it** — passes every other key in the section as a keyword argument.
-
-Nested `_target_` blocks (like `collate_fn` inside `dataloader`) are recursively instantiated the same way.
-
-**The `recipe` key.** Every config file includes a top-level `recipe` key that tells the CLI *which recipe class* to run. You can write it as a **short name** or as a **fully-qualified Python path** — both resolve to the same class:
-
-```yaml
-# Short name (the CLI looks up the class automatically)
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-# Fully-qualified path (used as-is)
-recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-```
-
-The short name form is a convenience — the CLI scans all recipe modules under `nemo_automodel.recipes` and matches the bare class name. If you invoke the recipe script directly with `torchrun` instead of the `automodel` CLI, the `recipe` key is not required because the script itself *is* the recipe.
-
-**Not every section uses `_target_`.** Some sections like `step_scheduler`, `distributed`, and `checkpoint` are plain key-value groups consumed directly by the recipe — they control training schedule, parallelism strategy, and checkpoint behavior without instantiating a Python object.
-:::
-
-### Model
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`NeMoAutoModelForCausalLM.from_pretrained`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py) — a factory method that downloads (or loads from cache) a pretrained Hugging Face model and wraps it with NeMo distributed-training support. |
-| `pretrained_model_name_or_path` | A keyword argument to `from_pretrained`. Any argument that [`from_pretrained`](https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.from_pretrained) accepts can be added here (e.g. `cache_dir`, `torch_dtype`). |
-
-This guide uses **Meta Llama 3.2 1B** as a running example. Replace `pretrained_model_name_or_path` with any supported [Hugging Face model ID](https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/model-coverage/llm.md).
-
-:::{dropdown} About Llama 3.2 1B
-Llama is a family of decoder-only transformer models developed by Meta. The 1B variant is a compact model suitable for research and edge deployment, featuring RoPE positional embeddings, grouped-query attention (GQA), and SwiGLU activations.
-:::
-
-:::{dropdown} Accessing gated models
-Some Hugging Face models are **gated**. If the model page shows a "Request access" button:
-
-1. Log in with your Hugging Face account and accept the license.
-2. Ensure the token you use (from `huggingface-cli login` or `HF_TOKEN`) belongs to the approved account.
-
-Pulling a gated model without an authorized token triggers a 403 error.
-:::
-
-### Dataset
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad  # HF-Hub ID used to pull the dataset
-  split: train
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`make_squad_dataset`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) — a factory function that downloads the SQuAD dataset, tokenizes it, and returns a `torch.utils.data.Dataset`. To use a different dataset, change `_target_` to a different factory function (see [Integrate Your Own Text Dataset](dataset.md)). |
-| `dataset_name`, `split` | Keyword arguments passed to `make_squad_dataset`. Each dataset factory defines its own parameters — check the function signature to see what's available. |
-
-This guide uses **SQuAD v1.1** as a running example. Swap the dataset by changing `_target_` and the dataset arguments — see [Integrate Your Own Text Dataset](dataset.md) and [Dataset Overview](../dataset-overview.md).
-
-:::{dropdown} About SQuAD v1.1
-The Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset where each example consists of a Wikipedia passage, a question, and a span answer. SQuAD v1.1 guarantees all questions are answerable from the context, making it suitable for straightforward fine-tuning.
-
-Example:
-```json
-{
-    "context": "Architecturally, the school has a Catholic character. ...",
-    "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?",
-    "answers": { "text": ["Saint Bernadette Soubirous"], "answer_start": [515] }
-}
-```
-:::
-
-### PEFT (Optional)
-
-```yaml
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj"  # glob pattern matching linear layer FQNs
-  dim: 8                    # low-rank dimension of the adapters
-  alpha: 32                 # scaling factor for learned weights
-```
-
-| Key | Role |
-|-----|------|
-| `_target_` | Points to [`PeftConfig`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py) — a dataclass that describes which layers to adapt and how. Unlike the model and dataset sections, this instantiation produces a *config object*, not the adapter itself. The recipe passes the resulting `PeftConfig` into `build_model`, which applies LoRA adapters to the model. |
-| `target_modules` | A glob pattern matched against fully-qualified layer names (e.g. `"*.proj"` matches every layer whose name ends in `proj`). |
-| `dim` | The low-rank dimension *r* — controls adapter capacity. Larger values learn more but use more memory. |
-| `alpha` | Scaling factor applied to the adapter output (`alpha / dim`). Higher values give adapters more influence during training. |
-
-Including a `peft:` section enables LoRA fine-tuning. Remove it entirely to run SFT instead — see [Switching Between SFT and PEFT](#switching-between-sft-and-peft).
-
-#### QLoRA (Quantized Low-Rank Adaptation)
-
-If GPU memory is a constraint, [QLoRA](https://arxiv.org/abs/2305.14314) combines LoRA with 4-bit NormalFloat (NF4) quantization to reduce memory usage by up to 75% compared to full-parameter SFT in 16-bit precision, while maintaining comparable quality to standard LoRA.
-
-To enable QLoRA, add a `quantization:` section alongside the `peft:` section in your config. Note two differences from the standard PEFT config above: `target_modules` uses the broader `"*_proj"` pattern to apply LoRA to all projection layers (wider coverage compensates for precision loss from 4-bit weights), and `dim` is increased from 8 to 16 for additional adapter capacity.
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*_proj"  # broader glob than "*.proj" to cover all projection layers
-  dim: 16                   # LoRA rank (higher than default to offset quantization)
-  alpha: 32                # scaling factor
-  dropout: 0.1             # LoRA dropout rate
-
-quantization:
-  load_in_4bit: True                   # enable 4-bit quantization
-  load_in_8bit: False                  # use 4-bit, not 8-bit
-  bnb_4bit_compute_dtype: bfloat16     # compute dtype
-  bnb_4bit_use_double_quant: True      # double quantization for extra savings
-  bnb_4bit_quant_type: nf4             # NormalFloat quantization type
-  bnb_4bit_quant_storage: bfloat16     # storage dtype for quantized weights
-```
-
-### Training Schedule
-
-```yaml
-step_scheduler:
-  num_epochs: 1     # Will train over the dataset once.
-```
-
-Unlike the sections above, `step_scheduler` has **no `_target_`** — it is not instantiated into a Python object. Instead, the recipe reads its keys directly to control the training loop (how many epochs to run, when to checkpoint, when to validate). This is typical of sections that configure *behavior* rather than *components*.
-
-All other settings (distributed strategy, optimizer, checkpointing, logging) use sensible defaults. See the [Full Configuration Reference](#full-configuration-reference) to customize them.
-
-### Full Config YAML
-
-:::{dropdown} finetune_config.yaml (click to expand)
-Save as `finetune_config.yaml`. This config runs PEFT (LoRA). To run SFT instead, remove the `peft:` section. For production-ready examples, see the hosted configs: [Llama 3.2 1B SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) and [Llama 3.2 1B PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml).
-
-```yaml
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj"
-  dim: 8
-  alpha: 32
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-
-step_scheduler:
-  num_epochs: 1
-```
-:::
-
-## Fine-Tune the Model
-
-You can run the recipe using the AutoModel CLI or directly with `torchrun` (advanced).
-
-```bash
-automodel --nproc-per-node=8 finetune_config.yaml
-```
-
-The `--nproc-per-node=8` flag specifies the number of GPUs per node. Adjust it to match your hardware; for a single GPU, omit the `--nproc-per-node` option.
-
-### Invoke the Recipe Script Directly (advanced)
-
-Alternatively, you can invoke the recipe [script](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py) directly using [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html), as shown below.
-
-``` bash
-torchrun --nproc-per-node=8 nemo_automodel/recipes/llm/train_ft.py -c finetune_config.yaml
-```
-
-### Sample Output
-Running the recipe, either through the `automodel` CLI or by invoking the recipe script directly, should produce
-a log similar to the following:
-```
-$ automodel finetune_config.yaml
-INFO:nemo_automodel.cli.app:Config: finetune_config.yaml
-INFO:nemo_automodel.cli.app:Recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-INFO:nemo_automodel.cli.app:Launching job interactively (local)
-cfg-path: finetune_config.yaml
-INFO:root:step 4 | epoch 0 | loss 1.5514 | grad_norm 102.0000 | mem: 11.66 GiB | tps 6924.50
-INFO:root:step 8 | epoch 0 | loss 0.7913 | grad_norm 46.2500 | mem: 14.58 GiB | tps 9328.79
-Saving checkpoint to checkpoints/epoch_0_step_10
-INFO:root:step 12 | epoch 0 | loss 0.4358 | grad_norm 23.8750 | mem: 15.48 GiB | tps 9068.99
-INFO:root:step 16 | epoch 0 | loss 0.2057 | grad_norm 12.9375 | mem: 16.47 GiB | tps 9148.28
-INFO:root:step 20 | epoch 0 | loss 0.2557 | grad_norm 13.4375 | mem: 12.35 GiB | tps 9196.97
-Saving checkpoint to checkpoints/epoch_0_step_20
-INFO:root:[val] step 20 | epoch 0 | loss 0.2469
-```
-
-Each log line reports the current loss, gradient norm, peak GPU memory, and tokens per second (TPS). Small fluctuations between steps (e.g., 0.2057 to 0.2557 above) are normal — look at the overall downward trend rather than individual values.
-
-### Checkpoint Contents
-
-Checkpoints are saved in native Hugging Face format, so no conversion is required — they work directly with Transformers, PEFT, vLLM, lm-eval-harness, and other tools in the Hugging Face ecosystem. SFT and PEFT produce different checkpoint layouts. **SFT checkpoints** contain the full model weights at `model/consolidated/` — a single, self-contained Hugging Face model directory created by gathering distributed shards into one location — and can be loaded directly. **PEFT checkpoints** contain only the adapter weights (~MBs instead of GBs) — at inference time you must load the original base model and apply the adapter on top. This distinction affects every downstream step (inference, publishing, deployment).
-
-:::{dropdown} Checkpoint directory structure
-**SFT checkpoint:**
-```bash
-$ tree checkpoints/epoch_0_step_10/
-checkpoints/epoch_0_step_10/
-├── config.yaml
-├── dataloader.pt
-├── model
-│   ├── consolidated
-│   │   ├── config.json
-│   │   ├── model-00001-of-00001.safetensors
-│   │   ├── model.safetensors.index.json
-│   │   ├── special_tokens_map.json
-│   │   ├── tokenizer.json
-│   │   ├── tokenizer_config.json
-│   │   └── generation_config.json
-│   ├── shard-00001-model-00001-of-00001.safetensors
-│   └── shard-00002-model-00001-of-00001.safetensors
-├── optim
-│   ├── __0_0.distcp
-│   └── __1_0.distcp
-├── rng.pt
-└── step_scheduler.pt
-
-3 directories, 15 files
-```
-
-**PEFT checkpoint:**
-```bash
-$ tree checkpoints/epoch_0_step_10/
-checkpoints/epoch_0_step_10/
-├── dataloader.pt
-├── config.yaml
-├── model
-│   ├── adapter_config.json
-│   ├── adapter_model.safetensors
-│   └── automodel_peft_config.json
-├── optim
-│   ├── __0_0.distcp
-│   └── __1_0.distcp
-├── rng.pt
-└── step_scheduler.pt
-
-2 directories, 9 files
-```
-:::
-
-## Run Inference
-
-Inference uses the Hugging Face `generate` API. Because SFT checkpoints are self-contained while PEFT checkpoints store only adapter weights (see [Checkpoint Contents](#checkpoint-contents)), the loading procedure differs between the two modes.
-
-### SFT Inference
-
-The SFT checkpoint at `model/consolidated/` is a complete Hugging Face model and can be loaded directly:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-ckpt_path = "checkpoints/epoch_0_step_10/model/consolidated"
-tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
-model = AutoModelForCausalLM.from_pretrained(ckpt_path)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-prompt = (
-    "Context: Architecturally, the school has a Catholic character. "
-    "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. "
-    "Immediately in front of the Main Building and facing it, is a copper statue of Christ "
-    "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n"
-    "Question: What is atop the Main Building?\n\n"
-    "Answer:"
-)
-inputs = tokenizer(prompt, return_tensors="pt").to(device)
-output = model.generate(**inputs, max_new_tokens=50)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-### PEFT Inference
-
-PEFT adapters must be loaded on top of the base model:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-
-base_model_name = "meta-llama/Llama-3.2-1B"
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-model = AutoModelForCausalLM.from_pretrained(base_model_name)
-
-adapter_path = "checkpoints/epoch_0_step_10/model/"
-model = PeftModel.from_pretrained(model, adapter_path)
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-prompt = (
-    "Context: Architecturally, the school has a Catholic character. "
-    "Atop the Main Building's gold dome is a golden statue of the Virgin Mary. "
-    "Immediately in front of the Main Building and facing it, is a copper statue of Christ "
-    "with arms upraised with the legend 'Venite Ad Me Omnes'.\n\n"
-    "Question: What is atop the Main Building?\n\n"
-    "Answer:"
-)
-inputs = tokenizer(prompt, return_tensors="pt").to(device)
-output = model.generate(**inputs, max_new_tokens=50)
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Evaluate the Fine-Tuned Model
-
-### During Training: Validation Loss
-
-The recipe automatically computes validation loss at the interval set by `val_every_steps`. Look for `[val]` lines in the training log:
-
-```text
-INFO:root:[val] step 20 | epoch 0 | loss 0.2469
-```
-
-A decreasing validation loss across checkpoints indicates the model is learning. If validation loss plateaus or increases while training loss continues to drop, the model may be overfitting — consider stopping earlier or reducing the learning rate.
-
-### After Training: lm-eval-harness
-
-For task-specific benchmarks (e.g., MMLU, GSM8K, HellaSwag accuracy), use [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) with the fine-tuned checkpoint:
-
-```bash
-pip install lm-eval
-
-# SFT checkpoint (using vLLM backend for faster evaluation)
-lm_eval --model vllm \
-  --model_args pretrained=checkpoints/epoch_0_step_20/model/consolidated/ \
-  --tasks hellaswag \
-  --batch_size auto
-
-# PEFT adapter (using Hugging Face backend with built-in PEFT support)
-lm_eval --model hf \
-  --model_args pretrained=meta-llama/Llama-3.2-1B,peft=checkpoints/epoch_0_step_20/model/ \
-  --tasks hellaswag \
-  --batch_size auto
-```
-
-:::{tip}
-The SFT example uses the `vllm` backend for faster evaluation (requires `pip install vllm`; see [Deploy with vLLM](#deploy-with-vllm) for setup details). The PEFT example uses the `hf` backend with lm-eval's built-in PEFT support to load the adapter on top of the base model.
-:::
-
-:::{tip}
-Run lm-eval-harness on the base model *before* fine-tuning to establish a baseline, then compare against the fine-tuned checkpoint.
-:::
-
-## Publish to the Hugging Face Hub
-
-Fine-tuned checkpoints and PEFT adapters are stored in Hugging Face-native format and can be uploaded directly to the Hub.
-
-1. Install the Hugging Face Hub library (if not already installed):
-
-```bash
-pip3 install huggingface_hub
-```
-
-2. Log in to Hugging Face:
-
-```bash
-huggingface-cli login
-```
-
-3. Upload:
-
-**SFT checkpoint:**
-```python
-from huggingface_hub import HfApi
-
-api = HfApi()
-api.upload_folder(
-    folder_path="checkpoints/epoch_0_step_10/model/consolidated",
-    repo_id="your-username/llama3.2_1b-finetuned-squad",
-    repo_type="model",
-)
-```
-
-**PEFT adapter:**
-```python
-from huggingface_hub import HfApi
-
-api = HfApi()
-api.upload_folder(
-    folder_path="checkpoints/epoch_0_step_10/model",
-    repo_id="your-username/llama3.2_1b-lora-squad",
-    repo_type="model",
-)
-```
-
-Once uploaded, load the checkpoint or adapter directly from the Hub:
-
-**SFT:**
-```python
-from transformers import AutoModelForCausalLM
-
-model = AutoModelForCausalLM.from_pretrained("your-username/llama3.2_1b-finetuned-squad")
-```
-
-**PEFT:**
-```python
-from transformers import AutoModelForCausalLM
-from peft import PeftModel
-
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
-model = PeftModel.from_pretrained(model, "your-username/llama3.2_1b-lora-squad")
-```
-
-## Deploy with vLLM
-
-[vLLM](https://github.com/vllm-project/vllm) is an efficient inference engine for production deployment of LLMs.
-
-:::{note}
-Make sure vLLM is installed (`pip install vllm`, or use an environment that includes it).
-:::
-
-### SFT Checkpoint with vLLM
-
-```python
-from vllm import LLM, SamplingParams
-
-llm = LLM(model="checkpoints/epoch_0_step_10/model/consolidated/", model_impl="transformers")
-params = SamplingParams(max_tokens=20)
-outputs = llm.generate("Toronto is a city in Canada.", sampling_params=params)
-print(f"Generated text: {outputs[0].outputs[0].text}")
-```
-```text
->>> Generated text:  It is the capital of Ontario. Toronto is a global hub for cultural tourism. The City of Toronto
-```
-
-### PEFT Adapter with vLLM
-
-PEFT adapter serving uses the `vLLMHFExporter` class, which is provided by the `nemo` package — a separate dependency from `nemo-automodel`.
-
-:::{important}
-Install both packages before proceeding:
-```bash
-pip install nemo vllm
-```
-:::
-
-```python
-from nemo.export.vllm_hf_exporter import vLLMHFExporter
-
-if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model', required=True, type=str, help="Local path of the base model")
-    parser.add_argument('--lora-model', required=True, type=str, help="Local path of the LoRA adapter")
-    args = parser.parse_args()
-
-    lora_model_name = "lora_model"
-
-    exporter = vLLMHFExporter()
-    exporter.export(model=args.model, enable_lora=True)
-    exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora_model)
-
-    print("vLLM Output: ", exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name))
-```
-
-## Full Configuration Reference
-
-This section documents all available config fields for the fine-tuning recipe. For the quick-start config, see [Configure Your Training Recipe](#configure-your-training-recipe).
-
-### Switching Between SFT and PEFT
-
-The `peft:` section controls which mode runs:
-
-| Mode | What to do in the YAML |
-|------|----------------------|
-| **PEFT (LoRA)** | Include the `peft:` section as shown below. |
-| **SFT (full-parameter)** | Remove or comment out the `peft:` section entirely. |
-
-All other config sections remain the same for both modes.
-
-### Full Configuration
-
-:::{dropdown} Full Config
-:open:
-```yaml
-# Recipe
-# Selects which recipe class runs the training loop.
-# Use a short name (auto-discovered) or a fully-qualified Python path:
-#   recipe: nemo_automodel.recipes.llm.train_ft.TrainFinetuneRecipeForNextTokenPrediction
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-# Training Schedule
-# Controls epoch count, batch sizes, and how often to checkpoint / validate.
-# No _target_ — these are plain values read directly by the recipe.
-step_scheduler:
-  grad_acc_steps: 4       # number of micro-batches accumulated before each optimizer
-                          # step. Effective per-GPU batch = grad_acc_steps × batch_size.
-  ckpt_every_steps: 10    # save a checkpoint every N gradient steps
-  val_every_steps: 10     # run the validation loop every N gradient steps
-  num_epochs: 1           # how many full passes over the training dataset
-
-# Process Group
-# Initializes the PyTorch distributed process group.
-# No _target_ — consumed directly by the recipe.
-# You normally would not need to tune this.
-dist_env:
-  backend: nccl           # communication backend: "nccl" (GPU, recommended) or "gloo" (CPU)
-  timeout_minutes: 1      # timeout for collective operations; increase for large models
-                          # that take longer to initialize
-
-# Distributed Strategy
-# Determines how model weights, data, and compute are split across GPUs.
-# No _target_ — consumed directly by the recipe.
-# See "Distributed Training: TP, PP, CP, and EP" in Advanced Topics for details.
-distributed:
-  strategy: fsdp2         # parallelism strategy: "fsdp2" (recommended), "megatron_fsdp",
-                          # or "ddp". FSDP2 shards parameters and optimizer states across
-                          # the data-parallel group.
-  dp_size: null           # data-parallel group size. null = auto-detect from
-                          # world_size ÷ (tp_size × cp_size × pp_size).
-  tp_size: 1              # tensor-parallel size: splits weight matrices across GPUs.
-                          # Set to 2, 4, or 8 if the model doesn't fit on one GPU.
-                          # Should divide evenly into the number of attention heads.
-  cp_size: 1              # context-parallel size: splits the input sequence across GPUs.
-                          # Increase for very long contexts (e.g. 32k+ tokens).
-  sequence_parallel: false # when true, extends TP to also shard activations along
-                          # the sequence dimension for additional memory savings
-
-# Random Number Generator
-# _target_ → StatefulRNG: a checkpointable RNG that ensures identical sequences
-# across training restarts. Seed and ranked are kwargs to StatefulRNG().
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111              # global random seed for reproducibility
-  ranked: true            # when true, each GPU rank gets a unique RNG stream derived
-                          # from the seed, so data shuffling differs per GPU
-
-# Model
-# _target_ → NeMoAutoModelForCausalLM.from_pretrained: downloads (or loads from
-# cache) a pretrained HuggingFace model and wraps it with NeMo distributed-training
-# support. Any from_pretrained kwarg is accepted (cache_dir, torch_dtype, etc.).
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# PEFT (remove / comment this entire section for full-parameter SFT)
-# _target_ → PeftConfig: a dataclass describing which layers get LoRA adapters.
-# The recipe passes this config into build_model(), which attaches adapters
-# to the matching layers.
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  target_modules: "*.proj" # glob pattern matched against fully-qualified layer names;
-                           # "*.proj" matches every layer ending in "proj"
-  dim: 8                   # low-rank dimension r — controls adapter capacity.
-                           # Larger values are more expressive but use more memory.
-  alpha: 32                # LoRA scaling factor: adapter output is scaled by alpha/dim.
-                           # Higher values give adapters more influence during training.
-  use_triton: True         # use an optimized Triton kernel for LoRA forward/backward
-                           # (requires the triton package)
-
-# Checkpointing
-# No _target_ — plain key-value group consumed by the recipe.
-checkpoint:
-  enabled: true            # set to false to skip saving checkpoints entirely
-  checkpoint_dir: checkpoints/  # output directory. Docker users: bind-mount this path
-                                # (e.g. -v $(pwd)/checkpoints:/workspace/checkpoints)
-                                # to persist checkpoints across container restarts.
-  model_save_format: safetensors  # "safetensors" (recommended, faster and safer) or
-                                  # "torch_save" (legacy pickle-based format)
-  save_consolidated: True  # when true, writes a single HuggingFace-compatible checkpoint
-                           # to model/consolidated/ that can be loaded directly by
-                           # Transformers, vLLM, etc. Requires safetensors format.
-
-# Training Dataset
-# _target_ → make_squad_dataset: a factory function that downloads the SQuAD
-# dataset, tokenizes it, and returns a torch Dataset. To use a different dataset,
-# change _target_ to another factory function (see the dataset guide).
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad  # HuggingFace Hub dataset ID
-  split: train                   # which split to use (train, validation, test)
-
-# Validation Dataset
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64  # cap validation set to 64 samples for faster eval loops;
-                             # remove this line to use the full validation set
-
-# Training Dataloader
-# _target_ → StatefulDataLoader: a checkpointable DataLoader from torchdata that
-# saves and restores iteration state across training restarts, so resumed runs
-# don't re-process already-seen batches.
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-                               # function that pads and batches individual samples
-                               # into tensors; can be swapped for custom collation
-  batch_size: 8                # samples per micro-batch per GPU
-  shuffle: true                # whether to shuffle the dataset each epoch
-
-# Validation Dataloader
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-  batch_size: 8
-
-# Loss Function
-# _target_ → MaskedCrossEntropy: standard cross-entropy loss that automatically
-# ignores padding tokens so they don't affect the gradient.
-# Other available loss functions (swap _target_ to use):
-#   - nemo_automodel.components.loss.chunked_ce.ChunkedCrossEntropy
-#       Computes CE in chunks along the sequence dimension to reduce peak memory.
-#       Useful for very long sequences. Accepts chunk_len (default 32).
-#   - nemo_automodel.components.loss.linear_ce.FusedLinearCrossEntropy
-#       Fuses the final linear projection (lm_head) with the CE computation,
-#       avoiding the full logit tensor. Significant memory savings for large vocabs.
-#   - nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy
-#       TE-based parallel CE with a Triton kernel. Designed for tensor-parallel
-#       setups where logits are sharded across TP ranks.
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-# Optimizer
-# _target_ → torch.optim.Adam: any torch.optim class can be used here (e.g.
-# AdamW, SGD). All remaining keys become kwargs to the constructor.
-optimizer:
-  _target_: torch.optim.Adam
-  lr: 1.0e-5               # learning rate — the most important hyperparameter to tune
-  betas: [0.9, 0.999]      # Adam momentum coefficients (β₁ for mean, β₂ for variance)
-  eps: 1e-8                 # small constant added to the denominator for numerical stability
-  weight_decay: 0           # L2 regularization strength (0 = no regularization)
-
-# Logging (optional)
-# Uncomment to enable Weights & Biases experiment tracking.
-# wandb:
-#   project: <your_wandb_project>    # W&B project name
-#   entity: <your_wandb_entity>      # W&B team or username
-#   name: <your_wandb_exp_name>      # display name for this run
-#   save_dir: <your_wandb_save_dir>  # local directory for W&B artifacts
-```
-:::
-
-### Config Field Reference
-
-| Section | Required? | What to change |
-|---------|-----------|----------------|
-| `model` | Yes | Set `pretrained_model_name_or_path` to your Hugging Face model ID. Source: [`auto_model.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/_transformers/auto_model.py). |
-| `peft` | PEFT only | Remove entirely for SFT. Adjust `dim` and `alpha` to tune adapter capacity. `use_triton: True` enables an optimized LoRA kernel (requires the `triton` package). For reduced memory usage, see [QLoRA](#qlora-quantized-low-rank-adaptation). Source: [`lora.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/_peft/lora.py). |
-| `dataset` | Yes | Change `_target_`, `dataset_name`, and `split` for your data. Source: [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py). |
-| `dataloader` | Optional | Adjust `batch_size` and `shuffle`. Uses [`StatefulDataLoader`](https://meta-pytorch.org/data/main/torchdata.stateful_dataloader.html) for checkpointable iteration. Collation: [`utils.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/utils.py). |
-| `loss_fn` | Optional | Default is [`MaskedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/masked_ce.py). Alternatives: [`ChunkedCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/chunked_ce.py) (long sequences), [`FusedLinearCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/linear_ce.py) (large vocabs), [`TEParallelCrossEntropy`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/loss/te_parallel_ce.py) (tensor-parallel). |
-| `rng` | Optional | Controls reproducibility. Source: [`rng.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/training/rng.py). |
-| `step_scheduler` | Yes | `grad_acc_steps` sets how many micro-batches accumulate per gradient step. `ckpt_every_steps` and `val_every_steps` are counted in gradient steps. |
-| `distributed` | Yes | `dp_size: null` means auto-detect from world size. Adjust `tp_size` for tensor parallelism across GPUs. |
-| `checkpoint` | Recommended | Set `checkpoint_dir` to a persistent path, especially in Docker. |
-| `optimizer` | Optional | Defaults are reasonable. Any `torch.optim` class can be substituted via `_target_`. |
-| `wandb` | Optional | Uncomment and configure to enable Weights & Biases logging. |
-
-For the fine-tuning recipe itself, see [`train_ft.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py). For more example configs, browse [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune).
-
-## Distributed Training: TP, PP, CP, and EP
-
-The `distributed:` section controls how the model and data are split across GPUs. NeMo AutoModel supports five parallelism dimensions, each of which slices the workload differently:
-
-| Dimension | Key | What it shards | When to use |
-|-----------|-----|---------------|-------------|
-| **Data Parallel (DP)** | `dp_size` | Replicates the model on each group of GPUs; each replica trains on a different data batch. | Default. Scales batch size linearly with GPU count. |
-| **Tensor Parallel (TP)** | `tp_size` | Splits individual weight matrices (attention, MLP) across GPUs within a node. | Model is too large to fit on a single GPU, or you want to reduce per-GPU memory at the cost of extra communication. |
-| **Pipeline Parallel (PP)** | `pp_size` | Assigns different *layers* (stages) to different GPUs and pipelines micro-batches through them. | Very deep models that don't fit even with TP, or multi-node training where TP's all-reduce is too expensive across nodes. |
-| **Context Parallel (CP)** | `cp_size` | Splits the input *sequence* across GPUs so each GPU processes a portion of the context. | Very long sequences that exceed single-GPU memory. |
-| **Expert Parallel (EP)** | `ep_size` | Distributes MoE experts across GPUs so each GPU holds a subset of experts. | Mixture-of-Experts models only. |
-
-These dimensions compose with each other. The relationship between them and total GPU count is:
-
-```text
-world_size = pp_size × dp_size × cp_size × tp_size
-```
-
-When `dp_size` is set to `null` (the default), it is inferred automatically:
-
-```text
-dp_size = world_size ÷ (tp_size × cp_size × pp_size)
-```
-
-EP does not appear in this formula — experts are distributed across the DP×CP rank groups, with the constraint that `(dp_size × cp_size)` must be divisible by `ep_size`.
-
-#### Data Parallel (default)
-
-Data parallelism is the default. With `strategy: fsdp2`, FSDP2 shards both model parameters and optimizer states across the DP group, so memory usage shrinks as you add GPUs:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null   # auto-detected from world_size ÷ (tp × cp × pp)
-  tp_size: 1
-  cp_size: 1
-```
-
-#### Tensor Parallelism
-
-TP splits weight matrices across GPUs within a single node. Set `tp_size` to the number of GPUs you want to shard over (typically 2, 4, or 8 — should divide evenly into the number of attention heads):
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 4
-  cp_size: 1
-  sequence_parallel: false   # set to true for additional memory savings
-```
-
-`sequence_parallel: true` extends TP to also shard activation memory along the sequence dimension, further reducing per-GPU memory at the cost of additional communication.
-
-#### Pipeline Parallelism
-
-PP assigns groups of layers to different GPUs and streams micro-batches through the stages. It requires an additional nested `pipeline:` section:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 4
-  pp_size: 4
-  cp_size: 1
-  activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b  # pipeline schedule (1f1b or interleaved1f1b)
-    pp_microbatch_size: 1         # micro-batch size per pipeline step
-    layers_per_stage: 4           # how many layers each stage handles
-    scale_grads_in_schedule: false
-```
-
-| Key | Role |
-|-----|------|
-| `pp_schedule` | The micro-batch schedule. `1f1b` is simpler; `interleaved1f1b` overlaps compute and communication for better throughput. |
-| `pp_microbatch_size` | Number of samples per micro-batch fed into the pipeline. Must satisfy: `local_batch_size ÷ pp_microbatch_size ≥ pp_size`. |
-| `layers_per_stage` | How many transformer layers each pipeline stage contains. If omitted, the framework splits layers evenly across `pp_size` stages. |
-
-:::{note}
-PP requires the model to define a `_pp_plan` that tells the framework how to split layers into stages. All built-in models include this plan; custom models must add one.
-:::
-
-#### Context Parallelism
-
-CP splits the sequence across GPUs — useful for very long contexts that exceed single-GPU memory. Set `cp_size` to the desired split factor:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 2
-```
-
-:::{important}
-When `cp_size > 1`, fused RoPE is automatically disabled. Some models also require the Transformer Engine (TE) attention backend for CP with packed sequences — the framework will raise an error with instructions if this applies.
-:::
-
-#### Expert Parallelism (MoE models)
-
-EP distributes MoE experts across GPUs. Set `ep_size` to the number of GPUs that share the full set of experts:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  tp_size: 1
-  cp_size: 1
-  pp_size: 1
-  ep_size: 8
-  activation_checkpointing: true
-```
-
-EP only applies to Mixture-of-Experts models (e.g. Qwen3-MoE, Mixtral, DeepSeek-V3). For dense models, leave `ep_size` at `1` or omit it.
-
-#### Combining Multiple Dimensions
-
-You can combine TP, PP, CP, and EP in a single config. For example, a large MoE model on a multi-node cluster might use:
-
-```yaml
-distributed:
-  strategy: fsdp2
-  dp_size: null
-  tp_size: 1
-  cp_size: 2
-  pp_size: 1
-  ep_size: 4
-  activation_checkpointing: true
-```
-
-When choosing a combination, keep these rules in mind:
-
-- **`world_size` must be evenly divisible by** `pp_size × tp_size × cp_size` (the quotient becomes `dp_size`).
-- **`(dp_size × cp_size) % ep_size == 0`** — EP shares the DP×CP groups.
-- **TP within a node, PP across nodes** is the typical layout — TP requires fast NVLink bandwidth, while PP tolerates higher latency.
-- **Start simple.** Use DP-only first. Add TP if the model doesn't fit on one GPU. Add PP for very large models. Add CP for long sequences. Add EP only for MoE architectures.
-
-## Next Steps
-
-- [Integrate Your Own Text Dataset](dataset.md) — swap the SQuAD example for your own data.
-- [Recipes and End-to-End Examples](../overview.md) — browse the full set of recipes available in NeMo AutoModel. See also the [`examples/llm_finetune/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune) directory for ready-to-run configs.
-- [Dataset Overview](../dataset-overview.md) — see all supported dataset types across LLM, VLM, and retrieval tasks.
-- [Knowledge Distillation](knowledge-distillation.md) — distill a fine-tuned model into a smaller one.
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/guides/llm/dataset.md
-```md
-# Integrate Your Own Text Dataset
-
-This guide shows you how to integrate your own dataset into NeMo Automodel for training. You'll learn about two main dataset types: **completion datasets** for language modeling (like [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag)) and **instruction datasets** for question-answering tasks (like [SQuAD](https://huggingface.co/datasets/rajpurkar/squad)). We'll cover how to create custom datasets by implementing the required methods and preprocessing functions, and finally show you how to specify your own data logic using YAML configuration with file paths—allowing you to define custom dataset processing without modifying the main codebase.
-
-## Quick Start Summary
-| **Type**        |  **Use Case**    | **Example** | **Preprocessor**               | **Section**              |
-| --------------- | ------------------ | -------------- | --------------------------------- | --------------------------- |
-| ✍️ Completion   | Language modeling  | HellaSwag      | `SFTSingleTurnPreprocessor`       | [Jump](#completion-datasets)  |
-| 🗣️ Instruction  | Question answering | SQuAD          | `make_*` function                 | [Jump](#instruction-datasets) |
-
-## Types of Supported Datasets
-
-NeMo Automodel supports a variety of datasets, depending on the task.
-### Completion Datasets
-
-**Completion datasets** are single text sequences designed for language modeling where the model learns to predict the next token given a context. These datasets typically contain a context (prompt) and a target (completion) that the model should learn to generate.
-
-#### Example: HellaSwag
-
-The [HellaSwag](https://huggingface.co/datasets/rowan/hellaswag) dataset is a popular completion dataset used for commonsense reasoning. It contains situations with multiple-choice endings where the model must choose the most plausible continuation.
-
-**HellaSwag dataset structure:**
-- **Context (`ctx`)**: A situation or scenario description
-- **Endings**: Multiple possible completions (4 options)
-- **Label**: Index of the correct ending
-
-**Example:**
-```
-Context: "A man is sitting at a piano in a large room."
-Endings: [
-  "He starts playing a beautiful melody.",
-  "He eats a sandwich while sitting there.",
-  "He suddenly becomes invisible.",
-  "He transforms into a robot."
-]
-Label: 0  # First ending is correct
-```
-
-#### Preprocessing with SFTSingleTurnPreprocessor
-
-NeMo Automodel provides the `SFTSingleTurnPreprocessor` class to handle completion datasets. This processor:
-
-1. **Extracts context and target** using `get_context()` and `get_target()`.
-2. **Tokenizes and cleans** context and target separately.
-3. **Concatenates** them into one sequence.
-4. **Creates loss mask**: `-100` for context, target IDs for target.
-5. **Pads** sequences to equal length.
-
-
-#### Create Your Own Completion Dataset
-
-To adapt your dataset into this format, define a class like this:
-
-```python
-from datasets import load_dataset
-from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor
-
-class MyCompletionDataset:
-    def __init__(self, path_or_dataset, tokenizer, split="train"):
-        raw_datasets = load_dataset(path_or_dataset, split=split)
-        processor = SFTSingleTurnPreprocessor(tokenizer)
-        self.dataset = processor.process(raw_datasets, self)
-
-    def get_context(self, examples):
-        """Extract context/prompt from your dataset"""
-        return examples["context_field"]  # Replace with your context field
-
-    def get_target(self, examples):
-        """Extract target/completion from your dataset"""
-        return examples["target_field"]   # Replace with your target field
-
-    def __getitem__(self, index):
-        return self.dataset[index]
-
-    def __len__(self):
-        return len(self.dataset)
-```
-
-
-### Instruction Datasets
-
-**Instruction datasets** are question-answer pairs where the model learns to respond to specific instructions or questions. These datasets are structured as context-question pairs with corresponding answers, making them ideal for teaching models to follow instructions and provide accurate responses.
-
-#### Example: SQuAD
-
-The [SQuAD (Stanford Question Answering Dataset)](https://huggingface.co/datasets/rajpurkar/squad) is a popular instruction dataset for reading comprehension. It contains questions based on Wikipedia articles along with their answers.
-
-**SQuAD dataset structure:**
-- **Context**: A paragraph of text from Wikipedia
-- **Question**: A question about the context
-- **Answers**: The correct answer with its position in the context
-
-#### Create Your Own Instruction Dataset
-
-The [`squad.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/components/datasets/llm/squad.py) file contains the implementation for processing the SQuAD dataset into a format suitable for instruction tuning. It defines a dataset class and preprocessing functions that extract the context, question, and answer fields, concatenate them into a prompt-completion format, and apply tokenization, padding, and loss masking. This serves as a template for building custom instruction datasets by following a similar structure and adapting the extraction logic to your dataset's schema.
-
-Based on the SQuAD implementation in `squad.py`, you can create your own instruction dataset using the `make_squad_dataset` pattern:
-
-```python
-from datasets import load_dataset
-
-def make_my_instruction_dataset(
-    tokenizer,
-    seq_length=None,
-    limit_dataset_samples=None,
-    split="train",
-    dataset_name="your-dataset-name",
-):
-    if limit_dataset_samples:
-        split = f"{split}[:{limit_dataset_samples}]"
-
-    dataset = load_dataset(dataset_name, split=split)
-
-    return dataset.map(
-        your_own_fmt_fn,  # Your formatting function
-        batched=False,
-        remove_columns=dataset.column_names,
-    )
-```
-
-## YAML-based Custom Dataset Configuration
-
-NeMo Automodel supports YAML-based dataset specification using the `_target_` key. This lets you reference dataset-building classes or functions using either:
-
-- 1. Python Dotted Path
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-
-- 2. File Path + Function Name
-
-```
-<file-path>:<function-name>
-```
-
-Where:
-- `<file-path>`: The absolute path to a Python file containing your dataset function
-- `<function-name>`: The name of the function to call from that file
-
-```yaml
-dataset:
-  _target_: /path/to/your/custom_dataset.py:build_my_dataset
-  num_blocks: 111
-```
-This will call `build_my_dataset()` from the specified file with the other keys (e.g., `num_blocks`) as arguments. This approach allows you to integrate custom datasets via config alone—no need to alter the codebase or package structure.
-
-
-## Packed Sequence Support in NeMo AutoModel
-NeMo AutoModel supports **packed sequences**, a technique to optimize training with variable-length sequences (e.g., text) by minimizing padding.
-
-### What is a Packed Sequence?
-Instead of padding each sequence to a fixed length (wasting computation on `[PAD]` tokens), packed sequences:
-- Concatenate short sequences into a single continuous sequence.
-- Separate sequences with special tokens (e.g., `[EOS]`).
-- Track lengths via an "attention mask" to prevent cross-sequence information leakage.
-
-### Benefits
-- Reduces redundant computation on padding tokens leading to faster training.
-- Enables larger effective batch sizes leading to better GPU utilization.
-- Especially useful for language modeling and text datasets.
-
-
-### Enable Packed Sequences in NeMo Automodel
-
-To enable packed sequences, add these keys to your recipe's YAML config:
-```yaml
-packed_sequence:
-   # Set packed_sequence_size > 0 to run with packed sequences
-   packed_sequence_size: 1024
-   split_across_pack: False
-```
-
-The `packed_sequence` has two options:
-- **packed_sequence_size**: Defines the total token length of each packed sequence; higher values require more GPU memory.
-- **split_across_pack**: If `true`, a sequence may be split across different packed sequences.
-
-
-## Troubleshooting Tips
-
-- **Tokenization Mismatch?** Ensure your tokenizer aligns with the model's expected inputs.
-- **Dataset too large?** Use `limit_dataset_samples` in your YAML config to load a subset, useful for quick debugging.
-- **Loss not decreasing?** Verify that your loss mask correctly ignores prompt tokens.
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/guides/dataset-overview.md
-```md
-# Dataset Overview: LLM, VLM, and Retrieval Datasets
-
-This page summarizes the datasets supported in NeMo AutoModel for LLM, VLM, and retrieval training and shows how to plug in your own datasets using Python functions or the YAML `_target_` mechanism.
-
-- See also: [LLM datasets](llm/dataset.md), [VLM datasets](vlm/dataset.md), and [Retrieval dataset](llm/retrieval-dataset.md) for deeper, task-specific guides.
-
-- If a dataset you need is missing, please open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues) with a short description and example schema so we can prioritize support.
----
-
-## LLM Datasets
-
-NeMo AutoModel supports several common patterns for language modeling and instruction tuning.
-### HellaSwag (Completion SFT)
-- Wrapper: `nemo_automodel.components.datasets.llm.hellaswag.HellaSwag`
-- Use case: single-turn completion-style SFT where a prompt (ctx) is followed by a gold continuation (ending)
-- Key args: `path_or_dataset`, `split`, `num_samples_limit`
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-
-### SQuAD-Style Question Answering (QA) (Instruction SFT)
-- Factory: `nemo_automodel.components.datasets.llm.squad.make_squad_dataset`
-- Use case: instruction/QA tuning with either prompt-and-answer formatting or chat-template formatting
-:::{note}
-- If the tokenizer has a chat template and you want answer-only loss, you must provide `start_of_turn_token`.
-- Optional `seq_length` can be used for padding/truncation.
-:::
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  split: train
-  dataset_name: rajpurkar/squad
-  start_of_turn_token: "<|assistant|>"
-```
-
-- **ColumnMappedTextInstructionDataset (generic instruction SFT)**
-  - Class: `nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset`
-  - Use case: quickly adapt instruction datasets by mapping your schema's columns to `context`, `question`, `answer`
-  - Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-  - Notes:
-    - For tokenizers with chat templates and answer-only loss, you may set `answer_only_loss_mask: true` and provide `start_of_turn_token`.
-    - Supports streaming mode for large datasets (see [Streaming Datasets](#streaming-datasets) section below).
-    - Map-style, non-streaming dataset (supports `len(ds)` and `ds[i]`)
-    - For streaming (including Delta Lake / Databricks), use `ColumnMappedTextInstructionIterableDataset`
-  - Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information.
-
-- **ChatDataset (multi-turn conversations and tool calling)**
-  - Class: `nemo_automodel.components.datasets.llm.ChatDataset`
-  - Use case: multi-turn conversations and tool calling in OpenAI chat format
-  - Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-  - Key args:
-    - `path_or_dataset_id`: path to local file(s) or HuggingFace dataset ID
-    - `tokenizer`: tokenizer instance (required; must have chat template support)
-    - `split`: dataset split (e.g., "train", "validation")
-    - `name`: dataset configuration/subset name
-    - `seq_length`: maximum sequence length for padding/truncation
-    - `padding`: padding strategy ("do_not_pad", "max_length", etc.)
-    - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.)
-    - `start_of_turn_token`: token marking assistant response start (for answer-only loss)
-    - `chat_template`: optional override for tokenizer's chat template
-    - `skip_invalid_samples`: if ``true``, skip malformed JSONL lines when reading local files (warnings log skip counts); default ``false`` fails fast on a bad line
-  - Notes:
-    - Requires a tokenizer with chat template support
-    - Supports both single-turn and multi-turn tool calling
-    - Tool definitions are provided in a `tools` field at the conversation level
-    - Tool calls appear in assistant messages via `tool_calls` field
-    - Tool responses use the `tool` role
-### ChatDataset (Multi-Turn Conversations and Tool Calling)
-- Class: `nemo_automodel.components.datasets.llm.ChatDataset`
-- Use case: multi-turn conversations and tool calling in OpenAI chat format
-- Sources: local JSON/JSONL or Hugging Face Hub dataset ID
-- Key args:
-  - `path_or_dataset_id`: path to local file(s) or Hugging Face dataset ID
-  - `tokenizer`: tokenizer instance (required; must have chat template support)
-  - `split`: dataset split (e.g., "train", "validation")
-  - `name`: dataset configuration/subset name
-  - `seq_length`: maximum sequence length for padding/truncation
-  - `padding`: padding strategy ("do_not_pad", "max_length", etc.)
-  - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.)
-  - `start_of_turn_token`: token marking assistant response start (for answer-only loss)
-  - `chat_template`: optional override for tokenizer's chat template
-  - `mask_reasoning_content`: optionally exclude rendered `reasoning_content` tokens from loss
-  - `skip_invalid_samples`: if ``true``, skip malformed JSONL lines when reading local files (warnings log skip counts); default ``false`` fails fast on a bad line
-:::{note}
-- Requires a tokenizer with chat template support
-- Supports both single-turn and multi-turn tool calling
-- Assistant messages may also include `reasoning_content` for structured reasoning traces
-- Tool definitions are provided in a `tools` field at the conversation level
-- Tool calls appear in assistant messages through the `tool_calls` field
-- Tool responses use the `tool` role and must include `tool_call_id`
-- If your dataset contains `reasoning_content`, your chat template must render it explicitly or it will be dropped
-- For multi-turn tool-calling datasets, prefer chat templates that use `{% generation %}` blocks so assistant-turn loss masking is exact
-- Set `mask_reasoning_content: true` if you want to train on the final assistant answer while excluding rendered reasoning traces from loss
-- Set `skip_invalid_samples: true` for noisy local JSONL so lines that are not valid JSON are skipped instead of failing the load
-:::
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.ChatDataset
-  path_or_dataset_id: Salesforce/xlam-function-calling-60k
-  split: train
-  tokenizer:
-    _target_: transformers.AutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: google/functiongemma-270m-it
-  seq_length: 2048
-  start_of_turn_token: "<start_of_turn>"
-  mask_reasoning_content: false
-  skip_invalid_samples: false
-```
-  - Expected data format (OpenAI messages format):
-```json
-{
-  "messages": [
-    {
-      "role": "system",
-      "content": "You are a helpful assistant."
-    },
-    {
-      "role": "user",
-      "content": "What's the weather in Seattle and should I bring an umbrella?"
-    },
-    {
-      "role": "assistant",
-      "reasoning_content": "The user wants weather info and advice. I should call get_weather first, then decide whether an umbrella is needed.",
-      "content": "",
-      "tool_calls": [
-        {
-          "id": "call_1",
-          "type": "function",
-          "function": {
-            "name": "get_weather",
-            "arguments": "{\"city\": \"Seattle\"}"
-          }
-        }
-      ]
-    },
-    {
-      "role": "tool",
-      "tool_call_id": "call_1",
-      "content": "{\"temperature\": 55, \"condition\": \"rain\", \"precipitation_chance\": 0.85}"
-    },
-    {
-      "role": "assistant",
-      "reasoning_content": "It is raining with a high precipitation chance, so I should recommend bringing an umbrella.",
-      "content": "It's currently 55 degrees F and raining in Seattle with an 85% chance of continued precipitation. Yes, definitely bring an umbrella."
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "get_weather",
-        "description": "Get current weather for a city",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "city": {"type": "string"}
-          },
-          "required": ["city"]
-        }
-      }
-    }
-  ]
-}
-```
-  - Template requirement example for `reasoning_content`:
-```jinja
-{%- if message.reasoning_content %}
-{% generation %}
-{{ "<think>\n" + message.reasoning_content + "\n</think>\n" }}
-{% endgeneration %}
-{%- endif %}
-{% generation %}
-{{ message.content }}
-{% endgeneration %}
-```
-  - For single-turn tool calling (one tool call per conversation), omit the tool response and final assistant message:
-```json
-{
-  "messages": [
-    {
-      "role": "user",
-      "content": "Book a table for two at 7pm in Seattle."
-    },
-    {
-      "role": "assistant",
-      "content": "",
-      "tool_calls": [
-        {
-          "id": "call_1",
-          "type": "function",
-          "function": {
-            "name": "book_table",
-            "arguments": "{\"party_size\": 2, \"time\": \"19:00\", \"city\": \"Seattle\"}"
-          }
-        }
-      ]
-    }
-  ],
-  "tools": [
-    {
-      "type": "function",
-      "function": {
-        "name": "book_table",
-        "description": "Book a restaurant table",
-        "parameters": {
-          "type": "object",
-          "properties": {
-            "party_size": {"type": "integer"},
-            "time": {"type": "string"},
-            "city": {"type": "string"}
-          }
-        }
-      }
-    }
-  ]
-}
-```
-See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example with FunctionGemma.
-For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_fineproofs_chat.yaml](../../examples/llm_finetune/qwen/qwen2_5_0p5b_instruct_fineproofs_chat.yaml).
-
-### Retrieval (Embedding Fine-Tuning)
-- Factory: `nemo_automodel.components.datasets.llm.make_retrieval_dataset`
-- Collator: `nemo_automodel.components.datasets.llm.BiEncoderCollator`
-- Use case: embedding model fine-tuning with (query, positive doc, negative docs) contrastive learning
-- Supported schemas:
-  - Corpus-ID JSON (Merlin/NeMo-retriever style)
-  - Inline-text JSONL (e.g., `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}`)
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset
-  data_dir_list: /abs/path/to/train.jsonl
-  data_type: train
-  n_passages: 5
-collate_fn:
-  _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator
-  q_max_len: 512
-  p_max_len: 512
-```
-See the detailed guide, [Retrieval dataset](llm/retrieval-dataset.md), for more information.
-
-### NanoGPT Binary Shards (Pretraining)
-- Class: `nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset`
-- Use case: token-level LM pretraining over `.bin` shards produced by NanoGPT-style preprocessors (supports legacy and current formats)
-:::{note}
-- Streams contiguous `seq_len` slices, supports optional BOS alignment and `.bos.idx` sidecar files
-- Related tool: `tools/nanogpt_data_processor.py`
-:::
-
-### Megatron (Pretraining; Interoperable With Pre-Tokenized Megatron Data)
-- Class: `nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining`
-- Use case: large-scale LM pretraining over Megatron-LM formatted tokenized corpora
-- Interoperability: If your corpus has already been tokenized/indexed for Megatron (i.e., `.bin`/`.idx` pairs), you can point AutoModel to those assets directly. No re-tokenization required.
-- Key args: `paths` (single path, glob, weighted list, or per-split dict), `seq_length`, `tokenizer`, `split`, `index_mapping_dir`, `splits_to_build`
-- Example YAML:
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining
-  paths: /abs/path/to/processed_data_*_text_document*  # glob or explicit list
-  index_mapping_dir: /abs/path/to/mapping_dir
-  tokenizer:
-    _target_: transformers.AutoTokenizer.from_pretrained
-    pretrained_model_name_or_path: openai-community/gpt2
-  seq_length: 1024
-  split: "0.99, 0.01, 0.00"  # train, validation, test
-  splits_to_build: "train"
-```
-See the detailed [pretraining guide](llm/pretraining.md), which uses MegatronPretraining data.
-
-## Streaming Datasets
-
-Streaming datasets enable processing very large datasets without loading them entirely into memory. This is particularly useful when working with datasets that exceed available RAM or when you want to start training immediately without waiting for the full dataset to download.
-
-### What Are Streaming Datasets?
-
-Streaming datasets load and process data incrementally, one batch at a time, rather than loading the entire dataset into memory upfront. This approach:
-
-- **Reduces memory footprint**: Only the current batch resides in memory
-- **Enables training on massive datasets**: Process terabyte-scale datasets on machines with limited RAM
-- **Faster startup**: Begin training immediately without waiting for full dataset download
-- **Efficient for remote datasets**: Stream directly from Hugging Face Hub without local storage
-
-### When to Use Streaming
-
-Use streaming mode when:
-
-- Your dataset is very large (hundreds of GB or TB)
-- Available memory is limited compared to dataset size
-- You want to start training quickly without downloading the full dataset
-- You're experimenting with a subset of a large dataset
-
-Avoid streaming when:
-
-- Your dataset is small enough to fit comfortably in memory
-- You need random access to samples (e.g., for certain sampling strategies)
-- You need to know the exact dataset length upfront
-- Training requires multiple passes with different orderings
-
-### How to Enable Streaming
-
-For `ColumnMappedTextInstructionDataset`, use the streaming variant by changing the class to `ColumnMappedTextInstructionIterableDataset`:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-
-For Hugging Face datasets loaded directly, set `streaming=True`:
-
-```python
-from datasets import load_dataset
-
-# Non-streaming (loads entire dataset into memory)
-dataset = load_dataset("large-dataset/corpus", split="train", streaming=False)
-
-# Streaming (loads data incrementally)
-dataset = load_dataset("large-dataset/corpus", split="train", streaming=True)
-```
-
-### Streaming Limitations
-
-When using streaming datasets, be aware of these limitations:
-
-1. **No random access**: You cannot use `dataset[index]` to access specific samples. Streaming datasets only support iteration.
-
-2. **No length information**: The `len(dataset)` operation is not available. You cannot determine the total number of samples upfront.
-
-3. **Single-pass iteration**: Each iteration consumes the stream. To iterate multiple times, you need to recreate the dataset or use the `repeat_on_exhaustion` parameter.
-
-4. **Limited shuffling**: Shuffling is done with a buffer (not the entire dataset), which may not provide perfect randomization.
-
-### Distributed Training with Streaming
-
-Streaming datasets support distributed training through sharding:
-
-```python
-from nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset import (
-    ColumnMappedTextInstructionIterableDataset
-)
-
-dataset = ColumnMappedTextInstructionIterableDataset(
-    path_or_dataset_id="large-dataset/corpus",
-    column_mapping={"question": "input", "answer": "output"},
-    tokenizer=tokenizer,
-)
-
-# Shard the dataset across workers
-dataset = dataset.shard(num_shards=8, index=worker_id)
-
-# Enable shuffling with a buffer
-dataset = dataset.shuffle(buffer_size=10000, seed=42)
-
-# Set epoch for deterministic shuffling across epochs
-dataset.set_epoch(epoch_num)
-```
-
-### Performance Considerations
-
-**Memory vs. Speed Trade-offs**:
-- Streaming reduces memory usage but may be slower than in-memory datasets
-- Network latency can impact streaming performance for remote datasets
-- Use local caching when repeatedly accessing the same remote dataset
-
-**Buffer Size for Shuffling**:
-- Larger buffers provide better randomization but use more memory
-- A buffer size of 10,000-100,000 samples is typically a good balance
-- For perfect shuffling, you need a buffer size equal to the dataset size (defeating the purpose of streaming)
-
-**Prefetching**:
-- Most streaming implementations prefetch data in the background
-- This helps hide network latency and keeps GPUs busy
-- Adjust prefetch settings based on your network speed and batch size
-
-### Example: Streaming a Large Dataset
-
-Here's a complete example of using streaming for a large instruction-tuning dataset:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_iterable_dataset.ColumnMappedTextInstructionIterableDataset
-  path_or_dataset_id: HuggingFaceH4/ultrachat_200k
-  split: train_sft
-  column_mapping:
-    question: prompt
-    answer: completion
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-  repeat_on_exhaustion: true  # Automatically restart when stream ends
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  batch_size: 4
-  num_workers: 4
-```
-
-This configuration:
-- Streams the dataset without loading it fully into memory
-- Automatically repeats when the stream is exhausted
-- Uses multiple workers for efficient data loading
-- Applies answer-only loss masking during tokenization
-
-## Packed Sequence Support
-To reduce padding and improve throughput with variable-length sequences:
-```yaml
-packed_sequence:
-  packed_sequence_size: 8192   # > 0 enables packing
-  split_across_pack: false
-```
-Use a collator that pads to an FP8-friendly multiple when training with FP8:
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-    pad_seq_len_divisible: 16
-```
-
----
-
-## VLM Datasets (Vision/Audio + Language)
-VLM datasets are represented as conversations (message lists) that combine text with images or audio and are processed with the model's `AutoProcessor.apply_chat_template` and a suitable collate function.
-
-Built-in dataset makers (return lists of `conversation` dicts):
-- **RDR items**: `nemo_automodel.components.datasets.vlm.datasets.make_rdr_dataset` (HF: `quintend/rdr-items`)
-- **CORD-V2 receipts (Consolidated Receipt Dataset for Post-OCR Parsing)**: `nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset` (HF: `naver-clova-ix/cord-v2`)
-- **MedPix-VQA (Medical Pixel Question Answering)**: `nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset`
-- **CommonVoice 17 (CV17) (audio)**: `nemo_automodel.components.datasets.vlm.datasets.make_cv17_dataset`
-
-
-Each example follows the conversation schema expected by `apply_chat_template`, e.g.:
-```python
-{
-  "conversation": [
-    {
-      "role": "user",
-      "content": [
-        {"type": "image", "image": example_image},
-        {"type": "text",  "text":  "Describe this image."}
-      ]
-    },
-    {
-      "role": "assistant",
-      "content": [{"type": "text", "text": ground_truth_text}]
-    }
-  ]
-}
-```
-
-### Custom Chat Template
-By default, VLM fine-tuning uses the chat template built into the model's `AutoProcessor`. To override it, add `chat_template` under `dataset:` in your YAML config:
-
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
-  split: train
-  chat_template: "{% for msg in messages %}{{ msg.role }}: {{ msg.content }}\n{% endfor %}"
-```
-
-`chat_template` accepts a Jinja template string, a path to a `.jinja` file, or a path to a JSON file containing a `chat_template` key. The override is applied to both the processor and its tokenizer before dataset instantiation.
-
-### Collate Functions
-- `nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn`
-- `nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn` (Qwen2.5 VL)
-- `nemo_automodel.components.datasets.vlm.collate_fns.phi4_mm_collate_fn` (audio)
-
-Select in your YAML:
-```yaml
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  batch_size: 1
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.vlm.collate_fns.qwen2_5_collate_fn
-```
-If you want answer-only loss masking, provide a model-appropriate `start_of_response_token` to the collate function.
-
-See [Gemma-3n](omni/gemma3-3n.md) and [VLM dataset](vlm/dataset.md) for end-to-end examples.
-
----
-
-## Diffusion Datasets
-
-Diffusion models don't train directly on raw images or videos. Instead, the data is first encoded into a compact numerical representation called a latent — this is what the model actually learns from. Text captions are similarly converted into text embeddings that the model uses as conditioning.
-
-This encoding is done once during preprocessing, and the results are saved as cache files (.meta). Training then reads these cache files directly, which is significantly faster than re-encoding on every step.
-
-The built-in preprocessing tool ([`tools/diffusion/preprocessing_multiprocess.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/tools/diffusion/preprocessing_multiprocess.py)) handles this conversion. It uses a VAE (Variational Autoencoder) to encode visual data and a text encoder for captions, grouping outputs into resolution-bucketed directories compatible with the multiresolution dataloader.
-
-### Dataloader Builders
-
-- **Video (T2V)**: `nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader` — for Wan 2.1 and HunyuanVideo
-- **Image (T2I)**: `nemo_automodel.components.datasets.diffusion.build_text_to_image_multiresolution_dataloader` — for FLUX.1-dev
-
-### Example YAML (Video Dataloader)
-
-```yaml
-data:
-  dataloader:
-    _target_: nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader
-    cache_dir: /path/to/processed_meta
-    model_type: wan
-    base_resolution: [512, 512]
-    dynamic_batch_size: false
-    shuffle: true
-    drop_last: false
-    num_workers: 0
-```
-
-See the [Diffusion Dataset Preparation](diffusion/dataset.md) guide for full preprocessing instructions and configuration details.
-
----
-
-## Bring Your Own Dataset
-You can integrate custom datasets with zero code changes to NeMo AutoModel by using `_target_` in YAML. There are three approaches:
-
-### Point to an Existing Class or Function (Dotted Path)
-- LLM example (class):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
-  path_or_dataset: rowan/hellaswag
-  split: train
-```
-- LLM example (factory function):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  split: train
-  dataset_name: rajpurkar/squad
-```
-- VLM example (factory function):
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.vlm.datasets.make_cord_v2_dataset
-  split: train
-```
-
-### Point to a Local Python File and Function
-```yaml
-dataset:
-  _target_: /abs/path/to/my_custom_dataset.py:build_my_dataset
-  some_arg: 123
-  split: train
-```
-Where `build_my_dataset` returns either a `datasets.Dataset` or a list/iterator of conversation dicts (for VLM).
-
-### Use ColumnMappedTextInstructionDataset for Most Instruction Datasets (LLM)
-- Ideal when your data has columns like `instruction`, `input`, or `output` but with arbitrary names
-- Supports local JSON/JSONL and HF Hub
-```yaml
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: /abs/path/to/*.jsonl  # or org/repo on HF
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-  answer_only_loss_mask: true
-  start_of_turn_token: "<|assistant|>"
-```
-
-### Implement a Minimal Custom Class Pattern (LLM Completion)
-If you prefer Python, implement `get_context` and `get_target` and reuse the built-in preprocessor:
-```python
-from datasets import load_dataset
-from nemo_automodel.components.datasets.utils import SFTSingleTurnPreprocessor
-
-class MyCompletionDataset:
-    def __init__(self, path_or_dataset, tokenizer, split="train"):
-        raw_ds = load_dataset(path_or_dataset, split=split)
-        self.dataset = SFTSingleTurnPreprocessor(tokenizer).process(raw_ds, self)
-
-    def get_context(self, examples):
-        return examples["my_context_field"]
-
-    def get_target(self, examples):
-        return examples["my_target_field"]
-```
-Then reference your class with `_target_` in YAML.
-
-### Important Considerations
-- **Chat templates**: If your tokenizer has a chat template and you want answer-only loss, provide the correct `start_of_turn_token` (LLM) or `start_of_response_token` (VLM collate functions).
-- **Padding for FP8**: If training with FP8, set `pad_seq_len_divisible: 16` in your collate function to align sequence lengths.
-- **Packed sequences**: Prefer packed sequences for throughput when fine-tuning LLMs on variable-length corpora.
-- **Validation**: You can define a separate `validation_dataset` and `validation_dataloader` block mirroring your training config.
-
-For detailed, end-to-end recipes, browse the example configs under [examples/llm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune), [examples/llm_pretrain/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_pretrain), and [examples/vlm_finetune/](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/vlm_finetune).
-
-```
-
-File: /Users/mromeijn/src/Automodel/docs/guides/checkpointing.md
-```md
-# Checkpointing
-
-## Introduction
-
-During machine-learning experiments, the model-training routine regularly saves checkpoints. A checkpoint is a complete snapshot of a run that includes model weights, optimizer states, and other metadata required to resume training exactly where it left off. Writing these snapshots at regular intervals lets you recover quickly from crashes or pauses without losing progress.
-
-NeMo Automodel checkpoints capture the complete state of a distributed training run across multiple GPUs or nodes. This reduces memory overhead, improves GPU utilization, and allows training to be resumed with a different parallelism strategy.
-
-NeMo Automodel writes checkpoints in two formats: [Hugging Face Safetensors](https://github.com/huggingface/safetensors) and [PyTorch Distributed Checkpointing (DCP)](https://docs.pytorch.org/docs/stable/distributed.checkpoint.html). It also supports two layouts:
-
-- **Consolidated Checkpoints**: The complete model state is saved as a Hugging Face-compatible bundle, typically in a single file or a compact set of files with an index. Because tensors are not split across GPUs (unsharded), tools like Hugging Face, vLLM, and SGLang can load these checkpoints directly.
-
-- **Sharded Checkpoints**: During distributed training with parameter sharding, each GPU holds a subset (or "shard") of the full state, such as model weights and optimizer states. When checkpointing, each GPU writes its own shard independently without reconstructing the full model state.
-
-We provide an overview of the different types of available checkpoint formats in the table below.
-
-Task | Model domain  | DCP (sharded) | Safetensors (sharded) | Safetensors (consolidated) |
------|----------------------|:-----------:|:-------------------:|:------------------------:|
-SFT  | LLM                  | ✅          | ✅                   | ✅                      |
-SFT  | VLM                  | ✅          | ✅                   | ✅                      |
-PEFT | LLM / VLM            | 🚧          | 🚧                   | ✅                      | 
-
-
-Changing between output formats can be done seamlessly through the recipe's `yaml` configuration file:
-```yaml
-checkpoint:
-    ...
-    model_save_format: safetensors # Format for saving (torch_save or safetensors)
-    save_consolidated: true # Change to false if you want to save sharded checkpoints.
-    ...
-```
-> **Note:** For optimal compatibility with the Hugging Face ecosystem, including downstream tools such as vLLM and SGLang, we recommend using the checkpoint configuration provided above.
-
-::: {note}
-The optimizer states are _always_ saved in DCP (`.distcp` extension) format.
-:::
-
-## Checkpoint Symbolic Links
-
-NeMo Automodel automatically creates symbolic links in the checkpoint directory to provide convenient access to important checkpoints:
-
-- **LATEST**: Points to the most recently saved checkpoint. This is useful for resuming training from the last saved state.
-- **LOWEST_VAL**: Points to the checkpoint with the lowest validation score/loss. This provides easy access to the best-performing checkpoint based on validation metrics, making it ideal for model evaluation or deployment.
-
-These symbolic links eliminate the need to manually track checkpoint names or search through directories to find the best model. When validation is enabled in your training run, both links are automatically maintained and updated as training progresses.
-
-## Safetensors
-To ensure seamless integration with the Hugging Face ecosystem, NeMo Automodel saves checkpoints in the [Safetensors](https://github.com/huggingface/safetensors) format. Safetensors is a memory-safe, zero-copy alternative to Python's pickle (PyTorch .bin), natively supported by Hugging Face Transformers, offering both safety and performance advantages over Python pickle-based approaches.
-
-### Key Benefits:
-- **Native Hugging Face Compatibility**: Checkpoints can be loaded directly into Hugging Face-compatible tools, including vLLM, SGLang, and others.
-- **Memory Safety and Speed**: The Safetensors format prohibits saving serialized Python code, ensuring memory safety, and supports zero-copy loading for improved performance.
-- **Optional Consolidation**: Sharded checkpoints can be merged into a standard Hugging Face model format for easier downstream use.
-
-**Most importantly**, this format offers the added advantage of optionally consolidating multiple shards into a complete Hugging Face format model.
-
-### Example
-
-The following command runs the LLM fine-tuning recipe on two GPUs and saves the resulting checkpoint in the Safetensors format:
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format safetensors \
-    --checkpoint.save_consolidated True
-```
-
-::: {note}
-In the above command, we used the [`llama3_2_1b_squad.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/492add84a2b9d495946fe211c28973cd00051f3e/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml) config as a running example; adjust it as necessary for your case.
-More config examples can be found in our [`examples/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples) directory.
-:::
-
-If you're running on a single GPU, you can run:
-```bash
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format safetensors \
-    --checkpoint.save_consolidated True
-```
-
-After running for a few seconds, the standard output should be:
-```
-...
-> Saving checkpoint to checkpoints/epoch_0_step_20
-...
-```
-
-The `checkpoints/` directory should have the following contents:
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-└── epoch_0_step_20
-   ├── model
-   │   ├── consolidated
-   │   │   ├── config.json
-   │   │   ├── generation_config.json
-   │   │   ├── model-00001-of-00001.safetensors
-   │   │   ├── model.safetensors.index.json
-   │   │   ├── special_tokens_map.json
-   │   │   ├── tokenizer.json
-   │   │   └── tokenizer_config.json
-   │   ├── shard-00001-model-00001-of-00001.safetensors
-   │   └── shard-00002-model-00001-of-00001.safetensors
-   └── optim
-       ├── __0_0.distcp
-       └── __1_0.distcp
-...
-```
-
-The `epoch_0_step_20/` directory stores the full training state from step `20` of the first epoch, including both the model and optimizer states.
-
-We can load and run the consolidated checkpoint using the Hugging Face Transformers API directly:
-```python
-import torch
-from transformers import pipeline
-
-model_id = "checkpoints/epoch_0_step_20/model/consolidated/"
-pipe = pipeline(
-    "text-generation", 
-    model=model_id, 
-    torch_dtype=torch.bfloat16, 
-    device_map="auto",
-)
-
-print(pipe("The key to life is"))
-
->>> [{'generated_text': 'The key to life is to be happy. The key to happiness is to be kind. The key to kindness is to be'}]
-```
-
-Although this example uses the Hugging Face Transformers API, the `consolidated/` checkpoint is compatible with any Hugging Face-compatible tool, such as vLLM, SGLang, and others.
-
-
-## PEFT
-When training with Parameter-Efficient Fine-Tuning (PEFT) techniques, only a small subset of model weights is updated — the rest of the model remains frozen. This dramatically reduces the size of the checkpoint, often to just a few megabytes.
-
-### Why Consolidated Checkpoints?
-Because the PEFT state is so lightweight, sharded checkpointing adds unnecessary overhead. Instead, NeMo Automodel automatically saves a single, consolidated Hugging Face–compatible checkpoint when using PEFT. This makes it:
-
-- easier to manage and share (just the adapters),
-- compatible with Hugging Face Transformers out of the box,
-- ideal for deployment and downstream evaluation.
-
-### Example: PEFT Fine-Tuning on Two GPUs
-
-To fine-tune a model using PEFT and save a Hugging Face–ready checkpoint:
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag_peft.yaml --step_scheduler.ckpt_every_steps 20 --checkpoint.model_save_format safetensors
-```
-
-After training, you'll get a compact, consolidated Safetensors checkpoint that can be loaded directly with Hugging Face tools:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-├── epoch_0_step_20
-│   ├── config.yaml
-│   ├── dataloader
-│   │   ├── dataloader_dp_rank_0.pt
-│   │   └── dataloader_dp_rank_1.pt
-│   ├── losses.json
-│   ├── model
-│   │   ├── adapter_config.json
-│   │   ├── adapter_model.safetensors
-│   │   ├── automodel_peft_config.json
-│   │   ├── special_tokens_map.json
-│   │   ├── tokenizer.json
-│   │   └── tokenizer_config.json
-│   ├── optim
-│   │   ├── __0_0.distcp
-│   │   └── __1_0.distcp
-│   ├── rng
-│   │   ├── rng_dp_rank_0.pt
-│   │   └── rng_dp_rank_1.pt
-│   └── step_scheduler.pt
-├── training.jsonl
-└── validation.jsonl
-```
-
-The example below showcases the direct compatibility of NeMo Automodel with Hugging Face and PEFT:
-```python
-from peft import AutoPeftModelForCausalLM
-from transformers import AutoTokenizer
-
-checkpoint_path = "checkpoints/epoch_0_step_20/model/"
-model = AutoPeftModelForCausalLM.from_pretrained(checkpoint_path)
-tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
-
-model = model.to("cuda")
-model.eval()
-inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt")
-
-outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50)
-print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])
-
->>> Preheat the oven to 350 degrees and place the cookie dough in a large bowl. Roll the dough into 1-inch balls and place them on a cookie sheet. Bake the cookies for 10 minutes. While the cookies are baking, melt the chocolate chips in the microwave for 30 seconds.
-```
-
-## PyTorch DCP
-NeMo Automodel also offers native PyTorch DCP checkpointing support (`.distcp` extension). Like Safetensors, it provides load-time resharding and parallel saving.
-
-As a simple example, we can run the following command to launch the training recipe on two GPUs.
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format torch_save
-
-...
-> Saving checkpoint to checkpoints/epoch_0_step_20
-...
-```
-After 20 steps, the following checkpoint will be saved:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-└── epoch_0_step_20
-   ├── config.yaml
-   ├── dataloader
-   │   ├── dataloader_dp_rank_0.pt
-   │   └── dataloader_dp_rank_1.pt
-   ├── losses.json
-   ├── model
-   │   ├── __0_0.distcp
-   │   └── __1_0.distcp
-   └── optim
-       ├── __0_0.distcp
-       └── __1_0.distcp
-...
-```
-
-If you rerun the script, NeMo Automodel automatically detects and restores the most recent checkpoint.
-```bash
-automodel --nproc-per-node=2 examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --step_scheduler.ckpt_every_steps 20 \
-    --checkpoint.model_save_format torch_save
-
-...
-> Loading checkpoint from checkpoints/epoch_0_step_20
-...
-```
-
-## Saving Checkpoints When Using Docker
-
-When training inside a Docker container (see [Installation Guide](installation.md)), any files written to the container's filesystem are lost when the container exits (especially with `--rm`). To keep your checkpoints, you must **bind-mount a host directory** to the checkpoint path before starting the container:
-
-```bash
-docker run --gpus all -it --rm \
-  --shm-size=8g \
-  -v "$(pwd)"/checkpoints:/opt/Automodel/checkpoints \
-  nvcr.io/nvidia/nemo-automodel:25.11.00
-```
-
-You can also set a custom checkpoint directory via the YAML config or CLI override:
-```yaml
-checkpoint:
-  checkpoint_dir: /mnt/shared/my_checkpoints
-```
-```bash
-# Or via CLI override:
-automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
-    --checkpoint.checkpoint_dir /mnt/shared/my_checkpoints
-```
-
-When using a custom path, make sure the corresponding host directory is mounted into the container with `-v`.
-
-::: {tip}
-Mount additional host directories for datasets and the Hugging Face model cache to avoid re-downloading large models across container restarts. See the [Installation Guide](installation.md) for a complete `docker run` example with all recommended mounts.
-:::
-
-## Asynchronous Checkpointing
-
-NeMo Automodel can write checkpoints asynchronously to reduce training stalls caused by I/O. When enabled, checkpoint writes are scheduled in the background using PyTorch Distributed Checkpointing's async API while training continues.
-
-- **Enable** (YAML):
-  ```yaml
-  checkpoint:
-    is_async: true
-  ```
-- **Enable** (CLI): add `--checkpoint.is_async True` to your run command.
-- **Requirements**: PyTorch ≥ 2.9.0. If an older version is detected, async mode is automatically disabled.
-- **Behavior**: At most one checkpoint uploads at a time; the next save waits for the previous upload to finish. The `LATEST` symlink is updated after the async save completes (may be deferred until the next save call). During PEFT, adapter model files are written synchronously on rank 0; optimizer states can still use async.
-
-## Advanced Usage: Save Additional States
-You can also save additional states in NeMo Automodel. By default, we also automatically checkpoint the `dataloader`, `rng`, and `step_scheduler` states which are necessary to resume training accurately. In full, a Safetensors consolidated checkpoint will look like this:
-
-```
-checkpoints/
-├── LATEST -> epoch_0_step_20
-├── LOWEST_VAL -> epoch_0_step_20
-├── epoch_0_step_20
-│   ├── config.yaml
-│   ├── dataloader
-│   │   ├── dataloader_dp_rank_0.pt
-│   │   └── dataloader_dp_rank_1.pt
-│   ├── losses.json
-│   ├── model
-│   │   ├── consolidated
-│   │   │   ├── config.json
-│   │   │   ├── generation_config.json
-│   │   │   ├── model-00001-of-00001.safetensors
-│   │   │   ├── model.safetensors.index.json
-│   │   │   ├── special_tokens_map.json
-│   │   │   ├── tokenizer.json
-│   │   │   └── tokenizer_config.json
-│   │   ├── shard-00001-model-00001-of-00001.safetensors
-│   │   └── shard-00002-model-00001-of-00001.safetensors
-│   ├── optim
-│   │   ├── __0_0.distcp
-│   │   └── __1_0.distcp
-│   ├── rng
-│   │   ├── rng_dp_rank_0.pt
-│   │   └── rng_dp_rank_1.pt
-│   └── step_scheduler.pt
-├── training.jsonl
-└── validation.jsonl
-```
-
-If you want to define a new state to be checkpointed in the recipe, the easiest way is to create a new attribute in the recipe class (defined using `self.` inside the recipe). Just make sure that the new attribute implements both the `load_state_dict` and `state_dict` methods.
-
-Here is an example of what it might look like:
-
-```python
-
-class NewState:
-
-    def __init__(self, ...):
-        self.state_value = ...
-        self.another_value = ...
-        ...
-    
-    def state_dict(self) -> dict[str, Any]:
-        return {
-            "<some state you're tracking>": self.state_value,
-            "<another state you're tracking>": self.another_value,
-        }
-    
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        self.state_value = state_dict["<some state you're tracking>"]
-        self.another_value = state_dict["<another state you're tracking>"]
-```
-
-Inside your recipe class, define the new state as an instance attribute using `self.new_state = NewState(...)`.
-
-```
-
-File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-# To run this recipe:
-#   automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml --nproc-per-node 8
-# Adjust --nproc-per-node to the number of GPUs available on your machine.
-
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  ckpt_every_steps: 1000
-  val_every_steps: 10  # will run every x number of gradient steps
-  num_epochs: 1
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-# torch.compile configuration
-compile:
-  enabled: false
-  mode: "default"  # Options: "default", "reduce-overhead", "max-autotune"
-  fullgraph: false
-  dynamic: true  # Set to false for better performance with fixed shapes
-  backend: null  # Use default backend (inductor)
-
-clip_grad_norm:
-  max_norm: 1.0
-
-distributed:
-  strategy: fsdp2
-  dp_size: none
-  tp_size: 1
-  cp_size: 1
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-packed_sequence:
-  packed_sequence_size: 0
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn:
-    _target_: nemo_automodel.components.datasets.utils.default_collater
-
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-  # min_lr: 1.0e-5
-
-lr_scheduler:
-  lr_decay_style: cosine
-  min_lr: 1.0e-6
-
-# Uncomment and configure for W&B logging
-# wandb:
-#   project: <your_wandb_project>
-#   entity: <your_wandb_entity>
-#   name: <your_wandb_exp_name>
-#   save_dir: <your_wandb_save_dir>
-
-# Uncomment and configure for Mlflow logging
-# mlflow:
-#   experiment_name: "automodel-llm-llama3_2_1b_squad-finetune"
-#   run_name: ""
-#   tracking_uri: null
-#   artifact_location: null 
-#   tags:
-#     task: "squad-finetune"
-#     model_family: "llama3.2"
-#     model_size: "1b"
-#     dataset: "squad"
-#     framework: "automodel"
-
-ci:
-  recipe_owner: akoumpa
-
-```
-
-File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-# To run this recipe:
-#   automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml --nproc-per-node 8
-# Adjust --nproc-per-node to the number of GPUs available on your machine.
-
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 64
-  local_batch_size: 8
-  ckpt_every_steps: 1000
-  val_every_steps: 10  # will run every x number of gradient steps
-  num_epochs: 1
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 1111
-  ranked: true
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  match_all_linear: True
-  dim: 8
-  alpha: 32
-  use_triton: True
-
-distributed:
-  strategy: fsdp2
-  dp_size: none
-  tp_size: 1
-  cp_size: 1
-
-  sequence_parallel: false
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: train
-
-packed_sequence:
-  packed_sequence_size: 0
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-validation_dataset:
-  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
-  dataset_name: rajpurkar/squad
-  split: validation
-  limit_dataset_samples: 64
-
-validation_dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-
-optimizer:
-  _target_: torch.optim.Adam
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0
-  # min_lr: 1.0e-5
-
-lr_scheduler:
-  lr_decay_style: cosine
-  min_lr: 1.0e-6
-
-# Uncomment and configure for W&B logging
-# wandb:
-#   project: <your_wandb_project>
-#   entity: <your_wandb_entity>
-#   name: <your_wandb_exp_name>
-#   save_dir: <your_wandb_save_dir>
-
-ci:
-  recipe_owner: akoumpa
-
-```
-
-File: /Users/mromeijn/src/Automodel/examples/llm_finetune/llama3_1/llama3_1_8b_columnmapped_lora.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# LoRA configuration for Llama-3.1-8B on the natural-instructions dataset (column-mapped)
-# Uses LoRA adapters; no quantization is configured in this recipe
-#
-# To run this recipe:
-#   automodel examples/llm_finetune/llama3_1/llama3_1_8b_columnmapped_lora.yaml --nproc-per-node 8
-# Adjust --nproc-per-node to the number of GPUs available on your machine.
-
-recipe: TrainFinetuneRecipeForNextTokenPrediction
-
-step_scheduler:
-  global_batch_size: 32
-  local_batch_size: 4
-  ckpt_every_steps: 100
-  val_every_steps: 600
-  max_steps: 500
-
-dist_env:
-  backend: nccl
-  timeout_minutes: 1
-
-rng:
-  _target_: nemo_automodel.components.training.rng.StatefulRNG
-  seed: 42
-  ranked: true
-
-model:
-  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
-  pretrained_model_name_or_path: meta-llama/Llama-3.1-8B
-
-peft:
-  _target_: nemo_automodel.components._peft.lora.PeftConfig
-  match_all_linear: true
-  dim: 16
-  alpha: 32
-  dropout: 0.1
-
-distributed:
-  strategy: fsdp2
-  dp_size: none
-  tp_size: 1
-  cp_size: 1
-
-  sequence_parallel: false
-
-loss_fn:
-  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
-
-dataset:
-  _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-  path_or_dataset_id: Muennighoff/natural-instructions
-  split: train
-  column_mapping:
-    context: definition
-    question: inputs
-    answer: targets
-
-packed_sequence:
-  packed_sequence_size: 0
-
-dataloader:
-  _target_: torchdata.stateful_dataloader.StatefulDataLoader
-  collate_fn: nemo_automodel.components.datasets.utils.default_collater
-  shuffle: false
-
-# validation_dataset:
-#   _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
-#   path_or_dataset_id: Muennighoff/natural-instructions
-#   split: validation
-#   column_mapping:
-#     instruction: definition
-#     question: inputs
-#     answer: targets
-
-# validation_dataloader:
-#   _target_: torchdata.stateful_dataloader.StatefulDataLoader
-#   collate_fn: nemo_automodel.components.datasets.utils.default_collater
-#   shuffle: false
-
-optimizer:
-  _target_: torch.optim.AdamW
-  betas: [0.9, 0.999]
-  eps: 1e-8
-  lr: 1.0e-5
-  weight_decay: 0.01
-
-# Uncomment and configure for W&B logging
-# wandb:
-#   project: <your_wandb_project>
-#   entity: <your_wandb_entity>
-#   name: llama3_1_8b_squad_qlora
-#   save_dir: <your_wandb_save_dir> 
-
-ci:
-  recipe_owner: akoumpa
-  time: "00:15:00"
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/curator-data-acquisition.txt b/skills/nemotron-customize/context/curator-data-acquisition.txt
deleted file mode 100644
index c51ccba8b..000000000
--- a/skills/nemotron-customize/context/curator-data-acquisition.txt
+++ /dev/null
@@ -1,2905 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Curator
-├── docs
-│   ├── about
-│   │   ├── concepts
-│   │   │   ├── text
-│   │   │   │   ├── _images
-│   │   │   │   ├── data-acquisition-concepts.md *
-│   │   │   │   └── data-loading-concepts.md *
-│   │   │   ├── audio
-│   │   │   ├── image
-│   │   │   └── video
-│   │   │       └── _images
-│   │   └── release-notes
-│   ├── curate-text
-│   │   ├── load-data
-│   │   │   ├── common-crawl.md *
-│   │   │   ├── custom.md *
-│   │   │   ├── index.md *
-│   │   │   └── read-existing.md *
-│   │   ├── process-data
-│   │   │   ├── content-processing
-│   │   │   ├── deduplication
-│   │   │   ├── language-management
-│   │   │   ├── quality-assessment
-│   │   │   └── specialized-processing
-│   │   ├── synthetic
-│   │   │   └── nemotron-cc
-│   │   └── tutorials
-│   ├── _extensions
-│   │   ├── ai_assistant
-│   │   │   ├── assets
-│   │   │   │   └── styles
-│   │   │   ├── core
-│   │   │   ├── integrations
-│   │   │   └── ui
-│   │   ├── content_gating
-│   │   ├── json_output
-│   │   │   ├── content
-│   │   │   ├── core
-│   │   │   └── processing
-│   │   ├── rich_metadata
-│   │   │   └── templates
-│   │   └── search_assets
-│   │       ├── modules
-│   │       └── templates
-│   ├── _images
-│   ├── _templates
-│   ├── admin
-│   │   ├── deployment
-│   │   │   └── slurm
-│   │   └── integrations
-│   ├── curate-audio
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   │   ├── asr-inference
-│   │   │   ├── audio-analysis
-│   │   │   ├── quality-assessment
-│   │   │   └── text-integration
-│   │   └── tutorials
-│   ├── curate-images
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   │   ├── embeddings
-│   │   │   └── filters
-│   │   └── tutorials
-│   ├── curate-video
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   └── tutorials
-│   │       ├── _images
-│   │       └── pipeline-customization
-│   ├── get-started
-│   └── reference
-│       └── infrastructure
-├── nemo_curator
-│   ├── stages
-│   │   ├── text
-│   │   │   ├── download
-│   │   │   │   ├── base
-│   │   │   │   │   ├── download.py * +
-│   │   │   │   │   ├── extract.py * +
-│   │   │   │   │   └── stage.py * +
-│   │   │   │   ├── common_crawl
-│   │   │   │   │   └── stage.py * +
-│   │   │   │   ├── arxiv
-│   │   │   │   ├── html_extractors
-│   │   │   │   │   └── utils
-│   │   │   │   └── wikipedia
-│   │   │   ├── classifiers
-│   │   │   ├── deduplication
-│   │   │   ├── embedders
-│   │   │   ├── filters
-│   │   │   │   ├── fasttext
-│   │   │   │   ├── heuristic
-│   │   │   │   │   ├── code
-│   │   │   │   │   └── repetition
-│   │   │   │   ├── histogram
-│   │   │   │   └── token
-│   │   │   ├── io
-│   │   │   │   ├── reader
-│   │   │   │   └── writer
-│   │   │   ├── models
-│   │   │   ├── modifiers
-│   │   │   │   ├── fasttext
-│   │   │   │   ├── string
-│   │   │   │   └── unicode
-│   │   │   ├── modules
-│   │   │   └── utils
-│   │   ├── audio
-│   │   │   ├── advanced_pipelines
-│   │   │   │   └── audio_data_filter
-│   │   │   ├── alm
-│   │   │   ├── datasets
-│   │   │   │   ├── fleurs
-│   │   │   │   └── readspeech
-│   │   │   ├── filtering
-│   │   │   │   ├── band_filter_module
-│   │   │   │   └── sigmos_filter_module
-│   │   │   │       └── third_party
-│   │   │   │           └── sigmos
-│   │   │   ├── inference
-│   │   │   ├── io
-│   │   │   ├── metrics
-│   │   │   ├── postprocessing
-│   │   │   ├── preprocessing
-│   │   │   └── segmentation
-│   │   │       └── speaker_separation_module
-│   │   ├── deduplication
-│   │   │   ├── exact
-│   │   │   ├── fuzzy
-│   │   │   │   └── lsh
-│   │   │   ├── semantic
-│   │   │   └── shuffle_utils
-│   │   ├── image
-│   │   │   ├── deduplication
-│   │   │   ├── embedders
-│   │   │   ├── filters
-│   │   │   └── io
-│   │   ├── interleaved
-│   │   │   ├── filter
-│   │   │   ├── io
-│   │   │   │   ├── readers
-│   │   │   │   └── writers
-│   │   │   ├── pdf
-│   │   │   │   └── nemotron_parse
-│   │   │   └── utils
-│   │   ├── math
-│   │   │   ├── classifiers
-│   │   │   ├── download
-│   │   │   │   └── html_extractors
-│   │   │   └── modifiers
-│   │   ├── synthetic
-│   │   │   ├── nemo_data_designer
-│   │   │   └── nemotron_cc
-│   │   │       └── nemo_data_designer
-│   │   └── video
-│   │       ├── caption
-│   │       ├── clipping
-│   │       ├── embedding
-│   │       ├── filtering
-│   │       ├── io
-│   │       └── preview
-│   ├── backends
-│   │   ├── internal
-│   │   │   └── raft
-│   │   ├── ray_actor_pool
-│   │   ├── ray_data
-│   │   └── xenna
-│   ├── config
-│   │   └── text
-│   ├── core
-│   ├── metrics
-│   ├── models
-│   │   └── client
-│   ├── pipeline
-│   ├── tasks
-│   └── utils
-├── tutorials
-│   ├── text
-│   │   ├── download-and-extract
-│   │   │   └── README.md *
-│   │   ├── llama-nemotron-data-curation
-│   │   │   ├── filters
-│   │   │   ├── utils
-│   │   │   ├── README.md *
-│   │   │   └── main.py * +
-│   │   ├── deduplication
-│   │   │   ├── fuzzy
-│   │   │   └── semantic
-│   │   ├── distributed-data-classification
-│   │   ├── gliner-pii-redaction
-│   │   ├── megatron-tokenizer
-│   │   ├── peft-curation
-│   │   └── tinystories
-│   ├── audio
-│   │   ├── alm
-│   │   ├── callhome_diar
-│   │   ├── fleurs
-│   │   ├── readspeech
-│   │   └── single_speaker_filter
-│   ├── image
-│   │   └── getting-started
-│   ├── interleaved
-│   │   └── nemotron_parse_pdf
-│   ├── math
-│   ├── multimodal
-│   ├── slurm
-│   ├── synthetic
-│   │   ├── nemo_data_designer
-│   │   └── nemotron_cc
-│   │       ├── example_data
-│   │       └── nemo_data_designer
-│   └── video
-│       └── getting-started
-├── .cursor
-│   └── rules
-├── .github
-│   ├── actions
-│   │   ├── build-container
-│   │   └── test-template
-│   ├── scripts
-│   └── workflows
-│       └── config
-├── benchmarking
-│   ├── data_prep
-│   ├── runner
-│   │   └── sinks
-│   ├── scripts
-│   └── tools
-├── docker
-│   └── common
-├── fern
-│   ├── assets
-│   │   └── images
-│   ├── components
-│   └── versions
-│       ├── v25.09
-│       │   └── pages
-│       │       ├── about
-│       │       │   ├── concepts
-│       │       │   │   ├── audio
-│       │       │   │   ├── image
-│       │       │   │   ├── text
-│       │       │   │   └── video
-│       │       │   └── release-notes
-│       │       ├── admin
-│       │       │   ├── deployment
-│       │       │   └── integrations
-│       │       ├── api-reference
-│       │       │   ├── executors
-│       │       │   └── tasks
-│       │       ├── curate-audio
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── asr-inference
-│       │       │   │   ├── audio-analysis
-│       │       │   │   ├── quality-assessment
-│       │       │   │   └── text-integration
-│       │       │   └── tutorials
-│       │       ├── curate-images
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── embeddings
-│       │       │   │   └── filters
-│       │       │   └── tutorials
-│       │       ├── curate-text
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── content-processing
-│       │       │   │   ├── deduplication
-│       │       │   │   ├── language-management
-│       │       │   │   ├── quality-assessment
-│       │       │   │   └── specialized-processing
-│       │       │   └── tutorials
-│       │       ├── curate-video
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   └── tutorials
-│       │       │       └── pipeline-customization
-│       │       ├── get-started
-│       │       └── reference
-│       │           └── infrastructure
-│       └── v26.02
-│           └── pages
-│               ├── _images
-│               ├── about
-│               │   ├── concepts
-│               │   │   ├── audio
-│               │   │   ├── image
-│               │   │   ├── text
-│               │   │   │   └── _images
-│               │   │   └── video
-│               │   │       └── _images
-│               │   └── release-notes
-│               ├── admin
-│               │   ├── deployment
-│               │   │   └── slurm
-│               │   └── integrations
-│               ├── api-reference
-│               │   ├── executors
-│               │   └── tasks
-│               ├── curate-audio
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── asr-inference
-│               │   │   ├── audio-analysis
-│               │   │   ├── quality-assessment
-│               │   │   └── text-integration
-│               │   └── tutorials
-│               ├── curate-images
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── embeddings
-│               │   │   └── filters
-│               │   └── tutorials
-│               ├── curate-text
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── content-processing
-│               │   │   ├── deduplication
-│               │   │   ├── language-management
-│               │   │   ├── quality-assessment
-│               │   │   └── specialized-processing
-│               │   ├── synthetic
-│               │   │   └── nemotron-cc
-│               │   └── tutorials
-│               ├── curate-video
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   └── tutorials
-│               │       ├── _images
-│               │       └── pipeline-customization
-│               ├── get-started
-│               └── reference
-│                   └── infrastructure
-└── tests
-    ├── backends
-    │   ├── ray_actor_pool
-    │   └── ray_data
-    ├── config
-    ├── core
-    ├── fixtures
-    │   └── audio
-    │       └── alm
-    │           └── nested_manifests
-    │               ├── subdir_a
-    │               └── subdir_b
-    ├── metrics
-    ├── models
-    │   └── client
-    ├── pipelines
-    ├── stages
-    │   ├── audio
-    │   │   ├── advanced_pipelines
-    │   │   ├── alm
-    │   │   ├── datasets
-    │   │   ├── filtering
-    │   │   ├── inference
-    │   │   ├── io
-    │   │   ├── metrics
-    │   │   ├── postprocessing
-    │   │   ├── preprocessing
-    │   │   └── segmentation
-    │   ├── common
-    │   ├── deduplication
-    │   │   ├── exact
-    │   │   ├── fuzzy
-    │   │   ├── semantic
-    │   │   └── shuffle_utils
-    │   ├── image
-    │   │   ├── dedup
-    │   │   ├── embedders
-    │   │   ├── filters
-    │   │   └── io
-    │   ├── interleaved
-    │   │   ├── filter
-    │   │   ├── pdf
-    │   │   │   └── nemotron_parse
-    │   │   └── utils
-    │   ├── math_stages
-    │   │   ├── classifiers
-    │   │   ├── download
-    │   │   └── modifiers
-    │   ├── synthetic
-    │   │   ├── nemo_data_designer
-    │   │   └── nemotron_cc
-    │   │       └── nemo_data_designer
-    │   ├── text
-    │   │   ├── classifiers
-    │   │   ├── deduplication
-    │   │   ├── download
-    │   │   │   ├── arxiv
-    │   │   │   ├── base
-    │   │   │   ├── common_crawl
-    │   │   │   └── wikipedia
-    │   │   ├── embedders
-    │   │   ├── io
-    │   │   │   ├── reader
-    │   │   │   └── writer
-    │   │   ├── models
-    │   │   └── modules
-    │   └── video
-    │       ├── caption
-    │       │   └── fixtures
-    │       ├── clipping
-    │       ├── embedding
-    │       ├── filtering
-    │       ├── io
-    │       └── preview
-    ├── tasks
-    └── utils
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; selected files shown.
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/base.py
-Imports:
-  - import contextlib
-  - import copy
-  - import time
-  - from abc import ABC, ABCMeta, abstractmethod
-  - from inspect import isabstract
-  - from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, final
-  - from loguru import logger
-  - from nemo_curator.stages.resources import Resources
-  - from nemo_curator.tasks import Task
-  - from nemo_curator.backends.base import NodeInfo, WorkerMetadata
----
-Classes:
-  - StageMeta
-    Methods:
-      - L46: def __new__(mcls, name, bases, namespace, **kwargs):
-  - ProcessingStage
-    Methods:
-      - L92: def _name(self) -> str:
-      - L97: def _resources(self) -> Resources:
-      - L102: def _batch_size(self) -> int | None:
-      - L106: def __init_subclass__(cls, **kwargs):
-      - L127: def num_workers(self) -> int | None:
-      - L131: def validate_input(self, task: Task) -> bool:
-      - L161: def process(self, task: X) -> Y | list[Y]:
-      - L171: def process_batch(self, tasks: list[X]) -> list[Y]:
-      - L201: def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None:
-      - L209: def setup(self, worker_metadata: WorkerMetadata | None = None) -> None:
-      - L217: def teardown(self) -> None:
-      - L222: def supports_batch_processing(self) -> bool:
-      - L230: def __repr__(self) -> str:
-      - L234: def inputs(self) -> tuple[list[str], list[str]]:
-      - L244: def outputs(self) -> tuple[list[str], list[str]]:
-      - L254: def xenna_stage_spec(self) -> dict[str, Any]:
-      - L262: def with_(
-        self,
-        name: str | None = None,
-        resources: Resources | None = None,
-        batch_size: int | None = None,
-        runtime_env: dict[str, Any] | None = None,
-    ) -> ProcessingStage:
-      - L293: def get_config(self) -> dict[str, Any]:
-      - L305: def ray_stage_spec(self) -> dict[str, Any]:
-      - L316: def _log_metrics(self, metrics: dict[str, float]) -> None:
-      - L327: def _log_metric(self, name: str, value: float) -> None:
-      - L331: def _time_metric(self, name: str) -> contextlib.AbstractContextManager[None]:
-      - L339: def _consume_custom_metrics(self) -> dict[str, float]:
-    Properties:
-      - _is_abstract_root
-      - name
-      - resources
-      - batch_size
-      - runtime_env
-  - CompositeStage
-    Methods:
-      - L359: def __init__(self):
-      - L362: def inputs(self) -> tuple[list[str], list[str]]:
-      - L366: def outputs(self) -> tuple[list[str], list[str]]:
-      - L371: def decompose(self) -> list[ProcessingStage]:
-      - L381: def with_(self, stage_with_dict: dict[str, Any]) -> CompositeStage:
-      - L387: def decompose_and_apply_with(self) -> list[ProcessingStage]:
-      - L391: def _apply_with_(self, stages: list[ProcessingStage]) -> list[ProcessingStage]:
-      - L419: def process(self, task: X) -> Y | list[Y]:
-      - L425: def get_description(self) -> str:
-
-Functions:
-  - L62: def get_stage_class(name: str) -> type[ProcessingStage]:
-
-Global vars:
-  - X
-  - Y
-  - _STAGE_REGISTRY
----
-
-
-File: /Users/mromeijn/src/Curator/nemo_curator/tasks/file_group.py
-Imports:
-  - from dataclasses import dataclass, field
-  - from typing import Any
-  - from loguru import logger
-  - from .tasks import Task
----
-Classes:
-  - FileGroupTask
-    Methods:
-      - L33: def num_items(self) -> int:
-      - L37: def validate(self) -> bool:
-    Properties:
-      - reader_config
-      - data
----
-
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-acquisition-concepts.md
-```md
----
-description: "Core concepts for acquiring text data from remote sources including DocumentDownloader, DocumentIterator, and DocumentExtractor components"
-categories: ["concepts-architecture"]
-tags: ["data-acquisition", "remote-sources", "download", "extract", "distributed"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "concept"
-modality: "text-only"
----
-
-(about-concepts-text-data-acquisition)=
-
-# Data Acquisition Concepts
-
-This guide covers the core concepts for acquiring and processing text data from remote sources in NeMo Curator. Data acquisition focuses on downloading, extracting, and converting remote data sources into the `DocumentBatch` format for further processing.
-
-## Overview
-
-Data acquisition in NeMo Curator follows a three-stage architecture:
-
-1. **Generate URLs**: Discover and generate download URLs from minimal input
-2. **Download**: Retrieve raw data files from remote sources
-3. **Iterate** and **Extract**: Extract individual records from downloaded containers and convert raw content to clean, structured text
-
-This process transforms diverse remote data sources into a standardized `DocumentBatch` that can be used throughout the text curation pipeline.
-
-## Core Components
-
-The data acquisition framework consists of four abstract base classes that define the acquisition workflow:
-
-### URLGenerator
-
-Generates the URLs to download from a minimal input configuration. You need to override `generate_urls`, which returns the list of URLs that the user wants to download.
-
-**Example Implementation**:
-
-```python
-from dataclasses import dataclass
-from nemo_curator.stages.text.download import URLGenerator
-
-@dataclass
-class CustomURLGenerator(URLGenerator):
-    def generate_urls(self):
-        # Custom URL generation logic
-        urls = []
-        ...
-        return urls
-```
-
-### DocumentDownloader
-
-Connects to and downloads data from remote repositories. You must override `_get_output_filename` and `_download_to_path`; both are called by the underlying `download` function, which tries to be idempotent.
-
-**Example Implementation**:
-
-```python
-from nemo_curator.stages.text.download import DocumentDownloader
-
-class CustomDownloader(DocumentDownloader):
-    def __init__(self, download_dir: str):
-        super().__init__(download_dir=download_dir)
-    
-    def _get_output_filename(self, url: str) -> str:
-        # Custom logic to extract filename from URL
-        return url.split("/")[-1]
-    
-    def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]:
-        # Custom download logic
-        # Return (success_bool, error_message)
-        try:
-            # ... download implementation ...
-            return True, None
-        except Exception as e:
-            return False, str(e)
-```
-
-### DocumentIterator
-
-Extracts individual records from downloaded containers. You only need to override `iterate` and `output_columns`; `iterate` must contain the logic to load the file at the given local path and yield its documents. The resulting `list[dict]` is then converted to a Pandas DataFrame, which is passed to the extractor.
-
-**Example Implementation**:
-
-```python
-from collections.abc import Iterator
-from typing import Any
-from nemo_curator.stages.text.download import DocumentIterator
-
-class CustomIterator(DocumentIterator):
-    def __init__(self, log_frequency: int = 1000):
-        super().__init__()
-        self._log_frequency = log_frequency
-    
-    def iterate(self, file_path: str) -> Iterator[dict[str, Any]]:
-        # Custom iteration logic to load local file and return documents
-        for record in load_local_file_fn(file_path):
-            yield {"content": record_content, "metadata": record_metadata}
-    
-    def output_columns(self) -> list[str]:
-        return ["content", "metadata"]
-```
-
-### DocumentExtractor (Optional)
-
-DocumentExtractor works on a Pandas DataFrame and is optional.
-
-**Example Implementation**:
-
-```python
-from typing import Any
-from nemo_curator.stages.text.download import DocumentExtractor
-
-class CustomExtractor(DocumentExtractor):
-    def __init__(self):
-        super().__init__()
-    
-    def extract(self, record: dict[str, str]) -> dict[str, Any] | None:
-        # Custom extraction logic
-        cleaned_text = clean_content(record["content"])
-        detected_lang = detect_language(cleaned_text)
-        return {"text": cleaned_text, "language": detected_lang}
-    
-    def input_columns(self):
-        return ["content", "metadata"]
-    
-    def output_columns(self):
-        return ["text", "language"]
-```
-
-## Supported Data Sources
-
-NeMo Curator provides built-in support for major public text datasets:
-
-::::{grid} 2 2 2 3
-:gutter: 2
-
-:::{grid-item-card} {octicon}`globe;1.5em;sd-mr-1` Common Crawl
-:link: text-load-data-common-crawl
-:link-type: ref
-
-Download and extract web archive data from Common Crawl
-+++
-{bdg-secondary}`web-scale` {bdg-secondary}`multilingual`
-:::
-
-:::{grid-item-card} {octicon}`typography;1.5em;sd-mr-1` ArXiv
-:link: text-load-data-arxiv
-:link-type: ref
-
-Download and extract scientific papers from arXiv
-+++
-{bdg-secondary}`academic` {bdg-secondary}`scientific`
-:::
-
-:::{grid-item-card} {octicon}`book;1.5em;sd-mr-1` Wikipedia
-:link: text-load-data-wikipedia
-:link-type: ref
-
-Download and extract Wikipedia articles from Wikipedia dumps
-+++
-{bdg-secondary}`encyclopedic` {bdg-secondary}`structured`
-:::
-
-:::{grid-item-card} {octicon}`gear;1.5em;sd-mr-1` Custom Data Sources
-:link: text-load-data-custom
-:link-type: ref
-
-Implement a download and extract pipeline for a custom data source
-+++
-{bdg-secondary}`extensible` {bdg-secondary}`specialized`
-:::
-
-::::
-
-## Integration with Pipeline Architecture
-
-The data acquisition process seamlessly integrates with NeMo Curator's pipeline-based architecture. The `DocumentDownloadExtractStage` handles parallel processing through the distributed computing framework.
-
-### Acquisition Workflow
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.download import DocumentDownloadExtractStage
-from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter
-from nemo_curator.stages.base import ProcessingStage
-
-# Create composite stage
-class CustomDownloadExtractStage(DocumentDownloadExtractStage):
-    def __init__(
-        self,
-        download_dir: str = "./custom_downloads",
-        url_limit: int | None = None,
-        record_limit: int | None = None,
-        add_filename_column: bool | str = True,
-    ):
-        # Create the URL generator
-        self.url_generator = CustomURLGenerator()
-
-        # Create the downloader
-        self.downloader = CustomDownloader(download_dir=download_dir)
-
-        # Create the iterator
-        self.iterator = CustomIterator()
-
-        # Create the extractor
-        self.extractor = CustomExtractor()
-
-        # Initialize the parent composite stage
-        super().__init__(
-            url_generator=self.url_generator,
-            downloader=self.downloader,
-            iterator=self.iterator,
-            extractor=self.extractor,
-            url_limit=url_limit,
-            record_limit=record_limit,
-            add_filename_column=add_filename_column,
-        )
-        self.name = "custom_pipeline"
-
-    def decompose(self) -> list[ProcessingStage]:
-        """Decompose this composite stage into its constituent stages."""
-        return self.stages
-
-    def get_description(self) -> str:
-        """Get a description of this composite stage."""
-        return "Custom pipeline"
-
-# Initialize Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Define acquisition pipeline
-pipeline = Pipeline(name="data_acquisition")
-
-# Create download and extract stage with custom components
-custom_download_extract_stage = CustomDownloadExtractStage(...)
-pipeline.add_stage(custom_download_extract_stage)
-
-# Write the results
-pipeline.add_stage(JsonlWriter(...))
-
-# Execute acquisition pipeline
-results = pipeline.run()
-
-# Stop Ray client
-ray_client.stop()
-```
-
-## Performance Optimization
-
-### Parallel Processing
-
-Data acquisition leverages distributed computing frameworks for scalable processing:
-
-- **Parallel Downloads**: Each URL in the generated list downloads through separate workers
-- **Concurrent Extraction**: Files process in parallel across workers
-- **Memory Management**: Streaming processing for large files
-
-## Integration with Data Loading
-
-Data acquisition produces a standardized output that integrates seamlessly with Curator's {ref}`Data Loading Concepts <about-concepts-text-data-loading>`:
-
-```{note}
-Data acquisition includes basic content-level deduplication during extraction (such as removing duplicate HTML content within individual web pages). This is separate from the main deduplication pipeline stages (exact, fuzzy, and semantic deduplication) that operate on the full dataset after acquisition.
-```
-
-```python
-from nemo_curator.stages.text.io.writer import ParquetWriter
-
-# Create acquisition pipeline with all stages including writer
-acquisition_pipeline = Pipeline(name="data_acquisition")
-# ... add acquisition stages ...
-
-# Add writer to save results directly
-writer = ParquetWriter(path="acquired_data/")
-acquisition_pipeline.add_stage(writer)
-
-# Run pipeline to acquire and save data in one execution
-results = acquisition_pipeline.run()
-
-# Later: Load using pipeline-based data loading
-from nemo_curator.stages.text.io.reader import ParquetReader
-
-load_pipeline = Pipeline(name="load_acquired_data")
-reader = ParquetReader(file_paths="acquired_data/")
-load_pipeline.add_stage(reader)
-```
-
-This enables you to:
-
-- **Separate acquisition from processing** for better workflow management
-- **Cache acquired data** to avoid re-downloading
-- **Mix acquired and local data** in the same processing pipeline
-- **Use standard loading patterns** regardless of data origin
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-loading-concepts.md
-```md
----
-description: "Core concepts for loading and managing text datasets using pipeline-based readers and DocumentBatch tasks"
-categories: ["concepts-architecture"]
-tags: ["data-loading", "document-dataset", "parallel-dataset", "distributed", "gpu-accelerated", "local-files"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "concept"
-modality: "text-only"
----
-
-(about-concepts-text-data-loading)=
-
-# Data Loading Concepts
-
-This guide covers the core concepts for loading and managing text data from local files in NVIDIA NeMo Curator.
-
-## Pipeline-Based Data Loading
-
-NeMo Curator uses a **pipeline-based architecture** for handling large-scale text data processing. Data flows through processing stages that transform data, enabling distributed processing of local files.
-
-The system provides two primary readers for text data:
-
-- **JsonlReader** - For JSON Lines format files (most common).
-- **ParquetReader** - For columnar Parquet files (better performance for large datasets with PyArrow optimization).
-
-Both readers support optimization through:
-
-- **Field selection** - Reading specified columns to reduce memory usage.
-- **Partitioning control** - Using `blocksize` or `files_per_partition` to optimize `DocumentBatch` sizes during distributed processing.
-- **Recommended block size** - Use ~128MB for optimal object store performance with smaller data chunks.
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader, ParquetReader
-
-# Initialize Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Basic usage with optimization
-pipeline = Pipeline(name="data_processing")
-
-# Define file type (example)
-file_type = "jsonl"  # or "parquet" based on your data
-
-if file_type == "jsonl":
-    # JSONL reader with field selection and partitioning
-    jsonl_reader = JsonlReader(
-        file_paths="/path/to/jsonl_directory",
-        blocksize="128MB",  # Recommended for object store optimization
-        fields=["text", "id"]  # Column selection for efficiency
-    )
-    pipeline.add_stage(jsonl_reader)
-else:
-    # Parquet reader with performance optimization
-    parquet_reader = ParquetReader(
-        file_paths="/path/to/parquet_directory",
-        files_per_partition=4,  # Alternative to blocksize
-        fields=["text", "metadata"]
-    )
-    pipeline.add_stage(parquet_reader)
-
-# Execute pipeline
-results = pipeline.run()
-
-# Stop Ray client
-ray_client.stop()
-```
-
-## Optimization Strategies
-
-### Partitioning Control
-
-:::{note}
-**Partitioning Strategy**: Specify either `files_per_partition` or `blocksize`. If `files_per_partition` is provided, `blocksize` is ignored.
-:::
-
-```python
-# Option 1: Size-based partitioning (recommended)
-reader = JsonlReader(
-    file_paths="/path/to/data",
-    blocksize="128MB"  # Optimal for object store performance
-)
-
-# Option 2: File count-based partitioning  
-reader = JsonlReader(
-    file_paths="/path/to/data",
-    files_per_partition=16  # Match your cluster size
-)
-```
-
-### Performance Recommendations
-
-- **Block size and files per partition**: Use ~128MB for optimal performance. Very large batches lead to memory overhead when passing data between stages through the object store, while very small batches induce overhead from processing many more tasks. We recommend ~128MB as a good balance. Try to avoid going below 32MB or above 1GiB partition sizes.
-- **Field selection**: Specify the `fields` parameter to read only the required columns.
-- **Engine choice**: ParquetReader defaults to PyArrow with `dtype_backend="pyarrow"` for optimal performance and memory efficiency. If you encounter compatibility issues with certain data types or schemas, you can override these defaults through `read_kwargs`:
-  ```python
-  # Remove PyArrow dtype backend if compatibility issues arise
-  reader = ParquetReader(
-      file_paths="data.parquet",
-      read_kwargs={"dtype_backend": "numpy_nullable"}  # Falls back to Pandas default behavior
-  )
-  ```
-
-### Memory Tips
-
-:::{warning}
-If you set the `blocksize` parameter to a size smaller than your input file size(s), Curator does not split the input files and instead attempts to read each file in full. To avoid out-of-memory issues, use the helper script described below.
-:::
-
-If any of your individual JSONL or Parquet files are greater than 2 GiB, we recommend using the `nemo_curator/utils/split_large_files.py` helper script to split them into more manageable sizes and prevent out-of-memory issues. You can run it with:
-
-```bash
-python nemo_curator/utils/split_large_files.py --input-path "/path/to/input/dir" --file-type "parquet" --output-path "/path/to/output/dir" --target-size-mb 128
-```
-
-It supports splitting JSONL or Parquet files as specified by the `--file-type` argument.
-
-Another option is running file splitting within your existing script. For example, you can split large JSONL files with:
-
-```python
-import ray
-from nemo_curator.core.client import RayClient
-from nemo_curator.utils.split_large_files import split_jsonl_file_by_size
-
-# Start Ray client as usual
-ray_client = RayClient()
-ray_client.start()
-
-input_files = []  # your list of input jsonl files
-
-ray.get(
-  [
-    split_jsonl_file_by_size.remote(
-      input_file=f,
-      output_path="/path/to/output/dir",
-      target_size_mb=128,
-    )
-    for f in input_files
-  ]
-)
-
-# initialize your Curator pipeline with JsonlReader, etc.
-```
-
-Similarly for Parquet files:
-
-```python
-import ray
-from nemo_curator.core.client import RayClient
-from nemo_curator.utils.split_large_files import split_parquet_file_by_size
-
-# Start Ray client as usual
-ray_client = RayClient()
-ray_client.start()
-
-input_files = []  # your list of input parquet files
-
-ray.get(
-  [
-    split_parquet_file_by_size.remote(
-      input_file=f,
-      output_path="/path/to/output/dir",
-      target_size_mb=128,
-    )
-    for f in input_files
-  ]
-)
-
-# initialize your Curator pipeline with ParquetReader, etc.
-```
-
-## Data Export Options
-
-NeMo Curator provides flexible export options for processed datasets:
-
-```python
-from nemo_curator.stages.text.io.writer import JsonlWriter, ParquetWriter
-
-# Add writers to pipeline after processing stages
-pipeline.add_stage(JsonlWriter(path="output_directory/"))
-# or
-pipeline.add_stage(ParquetWriter(path="output_directory/"))
-```
-
-## Common Loading Patterns
-
-### Multi-Source Data
-
-```python
-# Combine multiple directories with same reader type
-reader = JsonlReader(file_paths=[
-    "dataset_v1/",
-    "dataset_v2/", 
-    "additional_data/"
-])
-```
-
-:::{note}
-You cannot combine different reader types (`JsonlReader` + `ParquetReader`) in the same pipeline stage. To read different file types together, create a new `CustomReader` from the underlying `BaseReader` that reads each file according to its extension.
-:::
-
-## Remote Data Sources
-
-This page focuses on loading text data from **local files** using `JsonlReader` and `ParquetReader`. Both readers support remote storage locations (Amazon S3, Azure) when you provide remote file paths.
-
-For downloading and processing data from **remote sources** like ArXiv, Common Crawl, and Wikipedia, refer to the {ref}`Data Acquisition Concepts <about-concepts-text-data-acquisition>` page which covers:
-
-- **URLGenerator, DocumentDownloader, DocumentIterator, DocumentExtractor** components.
-- **Built-in support** for Common Crawl, ArXiv, Wikipedia, and custom sources.
-- **Integration patterns** with pipeline-based processing.
-- **Configuration and scaling** strategies.
-
-The data acquisition process produces standardized output that integrates seamlessly with the pipeline-based loading concepts described on this page.
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/index.md
-```md
----
-description: "Load text data from Common Crawl, Wikipedia, and custom datasets using Curator."
-categories: ["workflows"]
-tags: ["data-loading", "arxiv", "common-crawl", "wikipedia", "custom-data", "distributed", "ray"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "workflow"
-modality: "text-only"
----
-
-(text-load-data)=
-
-# Download Data
-
-Load text data from ArXiv, Common Crawl, Wikipedia, and custom sources using Curator.
-
-Curator provides a task-centric pipeline for downloading and processing large-scale public text datasets. It runs on Ray and converts raw formats like Common Crawl's `.warc.gz` into JSONL.
-
-## How it Works
-
-Curator uses a {ref}`4-step pipeline pattern <about-concepts-text-data-acquisition>` where data flows through stages as tasks. Each step uses a `ProcessingStage` that transforms tasks according to Curator's {ref}`pipeline-based architecture <about-concepts-text-data-loading>`.
-
-Data sources provide composite stages that combine these steps into complete download-and-extract pipelines, producing `DocumentBatch` tasks for further processing.
-
-::::{tab-set}
-
-:::{tab-item} Python
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.download import CommonCrawlDownloadExtractStage
-from nemo_curator.stages.text.io.writer import JsonlWriter
-
-# Initialize Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Create a pipeline for downloading Common Crawl data
-pipeline = Pipeline(
-    name="common_crawl_download",
-    description="Download and process Common Crawl web archives"
-)
-
-# Add data loading stage
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-50",
-    end_snapshot="2020-50",
-    download_dir="/tmp/cc_downloads",
-    crawl_type="main",
-    url_limit=10  # Limit for testing
-)
-pipeline.add_stage(cc_stage)
-
-# Add writer stage to save as JSONL
-writer = JsonlWriter(path="/output/folder")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()
-
-# Stop Ray client
-ray_client.stop()
-```
-
-:::
-
-::::
-
----
-
-## Data Sources & File Formats
-
-Load data from public datasets and custom data sources using Curator stages.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`file;1.5em;sd-mr-1` Read Existing Data
-:link: text-load-data-read-existing
-:link-type: ref
-Read existing JSONL and Parquet datasets using Curator's reader stages
-+++
-{bdg-secondary}`jsonl`
-{bdg-secondary}`parquet`
-:::
-
-:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Common Crawl
-:link: text-load-data-common-crawl
-:link-type: ref
-Download and extract web archive data from Common Crawl
-+++
-{bdg-secondary}`web-data`
-{bdg-secondary}`warc`
-{bdg-secondary}`html-extraction`
-:::
-
-:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Wikipedia
-:link: text-load-data-wikipedia
-:link-type: ref
-Download and extract Wikipedia articles from Wikipedia dumps
-+++
-{bdg-secondary}`articles`
-{bdg-secondary}`multilingual`
-{bdg-secondary}`xml-dumps`
-:::
-
-:::{grid-item-card} {octicon}`download;1.5em;sd-mr-1` Custom Data Sources
-:link: text-load-data-custom
-:link-type: ref
-Implement a download and extract pipeline for a custom data source
-+++
-{bdg-secondary}`jsonl`
-{bdg-secondary}`parquet`
-{bdg-secondary}`file-partitioning`
-:::
-
-::::
-
-```{toctree}
-:maxdepth: 4
-:titlesonly:
-:hidden:
-
-Read Existing Data <read-existing>
-arxiv
-common-crawl
-wikipedia
-Custom Data Sources <custom.md>
-```
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/common-crawl.md
-```md
----
-description: "Download and extract text from Common Crawl web archives using Curator."
-categories: ["how-to-guides"]
-tags: ["common-crawl", "web-data", "warc", "language-detection", "distributed", "html-extraction", "pipeline"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "how-to"
-modality: "text-only"
----
-
-(text-load-data-common-crawl)=
-
-# Common Crawl
-
-Download and extract text from Common Crawl snapshots using Curator.
-
-Common Crawl provides petabytes of web data collected over years of web crawling. The data uses a compressed web archive format (`.warc.gz`), which requires processing to extract useful text for language model training.
-
-## How it Works
-
-Curator's Common Crawl processing pipeline consists of four sequential stages:
-
-1. **URL Generation**: Generates WARC file URLs from Common Crawl's index for the specified snapshot range
-2. **Download**: Downloads the compressed WARC files from Common Crawl's servers (optionally using S3 for faster downloads)
-3. **Iteration**: Extracts individual records from WARC files and decodes HTML content
-4. **Extraction**: Performs language detection and extracts clean text using configurable HTML extraction algorithms
-
-The pipeline outputs structured data that you can write to JSONL or Parquet files for further processing.
-
-## Before You Start
-
-Choose your download method and ensure you have the prerequisites:
-
-- HTTPS downloads (default): No AWS account required.
-- S3 downloads (set `use_aws_to_download=True`):
-  - An AWS account with credentials configured (profile, environment, or instance role).
-  - Common Crawl's S3 access uses Requester Pays; you incur charges for requests and data transfer.
-  - `s5cmd` installed for fast S3 listing and copy operations:
-
-```bash
-# Install s5cmd for faster S3 downloads
-pip install s5cmd
-```
-
----
-
-## Usage
-
-Here's how to create and run a Common Crawl processing pipeline:
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.download import CommonCrawlDownloadExtractStage
-from nemo_curator.stages.text.io.writer import JsonlWriter
-
-def main():
-    # Initialize Ray client
-    ray_client = RayClient()
-    ray_client.start()
-
-    # Create pipeline
-    pipeline = Pipeline(
-        name="common_crawl_pipeline",
-        description="Download and process Common Crawl data"
-    )
-
-    # Add Common Crawl processing stage
-    cc_stage = CommonCrawlDownloadExtractStage(
-        start_snapshot="2020-50",  # YYYY-WW format for CC-MAIN
-        end_snapshot="2020-50",
-        download_dir="./cc_downloads",
-        crawl_type="main",  # or "news"
-        use_aws_to_download=True,  # Faster S3 downloads (requires s5cmd)
-        url_limit=10,  # Limit number of WARC files for testing
-        record_limit=1000,  # Limit records per WARC file
-    )
-    pipeline.add_stage(cc_stage)
-
-    # Add output writer stage
-    writer = JsonlWriter("./cc_output")
-    pipeline.add_stage(writer)
-
-    # Run pipeline
-    results = pipeline.run()
-
-    # Stop Ray client
-    ray_client.stop()
-
-if __name__ == "__main__":
-    main()
-```
-
-For executor options and configuration, refer to {ref}`reference-execution-backends`.
-
-### Writing to Parquet
-
-To write to Parquet files instead of JSONL, use `ParquetWriter`:
-
-```python
-from nemo_curator.stages.text.io.writer import ParquetWriter
-
-# Replace the JSONL writer with ParquetWriter
-writer = ParquetWriter("./cc_output_parquet")
-pipeline.add_stage(writer)
-```
-
-### Parameters
-
-```{list-table} CommonCrawlDownloadExtractStage Parameters
-:header-rows: 1
-:widths: 25 20 35 20
-
-* - Parameter
-  - Type
-  - Description
-  - Default
-* - `start_snapshot`
-  - str
-  - First snapshot to include (format: "YYYY-WW" for main, "YYYY-MM" for news). Not every year and week has a snapshot; refer to the official list at [https://data.commoncrawl.org/](https://data.commoncrawl.org/).
-  - Required
-* - `end_snapshot`
-  - str
-  - Last snapshot to include (same format as `start_snapshot`). Ensure your range includes at least one valid snapshot.
-  - Required
-* - `download_dir`
-  - str
-  - Directory to store downloaded WARC files
-  - Required
-* - `crawl_type`
-  - Literal["main", "news"]
-  - Whether to use CC-MAIN or CC-NEWS dataset
-  - "main"
-* - `html_extraction`
-  - HTMLExtractorAlgorithm | str | None
-  - Text extraction algorithm to use. Defaults to `JusTextExtractor()` if not specified.
-  - JusTextExtractor() if not specified
-* - `html_extraction_kwargs`
-  - dict | None
-  - Additional arguments for the HTML extractor. Ignored when `html_extraction` is a concrete extractor object (for example, `JusTextExtractor()`); pass kwargs to the extractor constructor instead. When `html_extraction` is a string ("justext", "resiliparse", or "trafilatura"), kwargs are forwarded.
-  - None
-* - `stop_lists`
-  - dict[str, frozenset[str]] | None
-  - Language-specific stop words for text quality assessment. If not provided, Curator uses jusText defaults with additional support for Thai, Chinese, and Japanese languages.
-  - None
-* - `use_aws_to_download`
-  - bool
-  - Use S3 downloads via s5cmd instead of HTTPS (requires s5cmd installation)
-  - False
-* - `verbose`
-  - bool
-  - Enable verbose logging for download operations
-  - False
-* - `url_limit`
-  - int | None
-  - Maximum number of WARC files to download (useful for testing)
-  - None
-* - `record_limit`
-  - int | None
-  - Maximum number of records to extract per WARC file
-  - None
-* - `add_filename_column`
-  - bool | str
-  - Whether to add source filename column to output; if str, uses it as the column name (default name: "file_name")
-  - True
-```
-
-## Output Format
-
-The pipeline processes Common Crawl data through several stages, ultimately producing structured documents. The extracted text includes the following fields:
-
-```json
-{
-  "url": "http://example.com/page.html",
-  "warc_id": "a515a7b6-b6ec-4bed-998b-8be2f86f8eac", 
-  "source_id": "CC-MAIN-20201123153826-20201123183826-00000.warc.gz",
-  "language": "ENGLISH",
-  "text": "Extracted web page content..."
-}
-```
-
-```{list-table} Output Fields
-:header-rows: 1
-:widths: 20 80
-
-* - Field
-  - Description
-* - `url`
-  - Original URL of the web page
-* - `warc_id`
-  - Unique identifier for the WARC record
-* - `source_id`
-  - Name of the source WARC file
-* - `language`
-  - Detected language of the content (e.g., "ENGLISH", "SPANISH")
-* - `text`
-  - Extracted and cleaned text content
-```
-
-When `add_filename_column` is enabled (the default), the output includes an extra field `file_name` (or your custom column name).
-
-## Customization Options
-
-### HTML Text Extraction Algorithms
-
-Curator supports several HTML text extraction algorithms:
-
-```{list-table} Available HTML Extractors
-:header-rows: 1
-:widths: 30 70
-
-* - Extractor
-  - Library
-* - `JusTextExtractor`
-  - [jusText](https://github.com/miso-belica/jusText)
-* - `ResiliparseExtractor`
-  - [Resiliparse](https://github.com/chatnoir-eu/chatnoir-resiliparse)
-* - `TrafilaturaExtractor`
-  - [Trafilatura](https://trafilatura.readthedocs.io/)
-```
-
-#### Configuring HTML Extractors
-
-```python
-from nemo_curator.stages.text.download.html_extractors import ResiliparseExtractor
-from nemo_curator.stages.text.download.html_extractors import TrafilaturaExtractor
-
-# Use Resiliparse for extraction
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-50",
-    end_snapshot="2020-50",
-    download_dir="./downloads",
-    html_extraction=ResiliparseExtractor(
-        required_stopword_density=0.25,
-        main_content=True
-    )
-)
-
-# Or use Trafilatura with custom parameters
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-50", 
-    end_snapshot="2020-50",
-    download_dir="./downloads",
-    html_extraction=TrafilaturaExtractor(
-        min_extracted_size=200,
-        max_repetitions=3
-    )
-)
-```
-
-### Language Processing
-
-You can customize language detection and extraction by providing stop words for different languages:
-
-```python
-# Define custom stop words for specific languages
-stop_lists = {
-    "ENGLISH": frozenset(["the", "and", "is", "in", "for", "where", "when", "to", "at"]),
-    "SPANISH": frozenset(["el", "la", "de", "que", "y", "en", "un", "es", "se", "no"])
-}
-
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-50",
-    end_snapshot="2020-50", 
-    download_dir="./downloads",
-    stop_lists=stop_lists
-)
-```
-
-## Advanced Usage
-
-### Processing CC-NEWS Data
-
-For Common Crawl News data, use the `news` crawl type with month-based snapshots:
-
-```python
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-08",  # YYYY-MM format for CC-NEWS
-    end_snapshot="2020-10",
-    download_dir="./news_downloads",
-    crawl_type="news"  # Use CC-NEWS instead of CC-MAIN
-)
-```
-
-See [https://data.commoncrawl.org/crawl-data/CC-NEWS/index.html](https://data.commoncrawl.org/crawl-data/CC-NEWS/index.html) for more information.
-
-### Large-Scale Processing
-
-For production workloads, consider these optimizations:
-
-```python
-cc_stage = CommonCrawlDownloadExtractStage(
-    start_snapshot="2020-50",
-    end_snapshot="2020-50", 
-    download_dir="/fast_storage/cc_downloads",
-    use_aws_to_download=True,  # Faster S3 downloads
-    verbose=False,  # Reduce logging overhead
-    # Remove limits for full processing
-    # url_limit=None,
-    # record_limit=None
-)
-```
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/custom.md
-```md
----
-description: "Create custom data loading pipelines using Curator."
-categories: ["how-to-guides"]
-tags: ["custom-data", "stages", "pipelines", "data-loading"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "advanced"
-content_type: "how-to"
-modality: "text-only"
----
-
-(text-load-data-custom)=
-
-# Custom Data Loading
-
-Create custom data loading pipelines using Curator. This guide shows how to build modular stages that run on Curator's distributed processing.
-
-## How It Works
-
-Curator uses the same **4-step pipeline pattern** (URL generation, download, iteration, and optional extraction) described in {ref}`Data Acquisition Concepts <about-concepts-text-data-acquisition>` for custom data loading. Each step uses an abstract base class with corresponding processing stages that compose into pipelines.
-
----
-
-## Architecture Overview
-
-For detailed information about the core components and data flow, see {ref}`Data Acquisition Concepts <about-concepts-text-data-acquisition>` and {ref}`Data Loading Concepts <about-concepts-text-data-loading>`.
-
----
-
-## Implementation Guide
-
-### 1. Create Directory Structure
-
-```text
-your_data_source/
-├── __init__.py
-├── stage.py           # Main composite stage
-├── url_generation.py  # URL generation logic
-├── download.py        # Download implementation
-├── iterator.py        # File iteration logic
-└── extract.py         # Data extraction logic (optional)
-```
-
-### 2. Build Core Components
-
-#### URL Generator (`url_generation.py`)
-
-```python
-from dataclasses import dataclass
-from nemo_curator.stages.text.download import URLGenerator
-
-@dataclass
-class CustomURLGenerator(URLGenerator):
-    def generate_urls(self) -> list[str]:
-        """Generate list of URLs to download."""
-        # Your URL generation logic here
-        return [
-            "https://example.com/dataset1.zip",
-            "https://example.com/dataset2.zip",
-        ]
-```
-
-#### Document Downloader (`download.py`)
-
-```python
-from nemo_curator.stages.text.download import DocumentDownloader
-
-class CustomDownloader(DocumentDownloader):
-    def __init__(self, download_dir: str):
-        super().__init__(download_dir=download_dir)
-
-    def _get_output_filename(self, url: str) -> str:
-        """Extract filename from URL."""
-        return url.split("/")[-1]
-
-    def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]:
-        """Download file from URL to local path."""
-        # Custom download logic
-        # Return (success_bool, error_message)
-        try:
-            # ... download implementation ...
-            return True, None
-        except Exception as e:
-            return False, str(e)
-```
-
-#### Document Iterator (`iterator.py`)
-
-```python
-import json
-from collections.abc import Iterator
-from typing import Any
-from nemo_curator.stages.text.download import DocumentIterator
-
-class CustomIterator(DocumentIterator):
-    def __init__(self, log_frequency: int = 1000):
-        super().__init__()
-        self._log_frequency = log_frequency
-
-    def iterate(self, file_path: str) -> Iterator[dict[str, Any]]:
-        """Iterate over records in a file."""
-        # Custom iteration logic to load local file and return documents
-        for record in load_local_file_fn(file_path):
-            yield {"content": record_content, "metadata": record_metadata, "id": record_id}
-
-    def output_columns(self) -> list[str]:
-        """Define output columns."""
-        return ["content", "metadata", "id"]
-```
-
-#### Document Extractor (`extract.py`)
-
-```python
-from typing import Any
-from nemo_curator.stages.text.download import DocumentExtractor
-
-class CustomExtractor(DocumentExtractor):
-    def __init__(self):
-        super().__init__()
-
-    def extract(self, record: dict[str, str]) -> dict[str, Any] | None:
-        """Transform raw record to final format."""
-        # Skip invalid records
-        if not record.get("content"):
-            return None
-
-        # Extract and clean text
-        cleaned_text = self._clean_text(record["content"])
-
-        # Generate unique ID if not present
-        doc_id = record.get("id", self._generate_id(cleaned_text))
-
-        return {
-            "text": cleaned_text,
-            "id": doc_id,
-            "source": record.get("metadata", {}).get("source", "unknown")
-        }
-
-    def input_columns(self) -> list[str]:
-        return ["content", "metadata", "id"]
-
-    def output_columns(self) -> list[str]:
-        return ["text", "id", "source"]
-
-    def _clean_text(self, text: str) -> str:
-        """Clean and normalize text."""
-        # Your text cleaning logic here
-        return text.strip()
-
-    def _generate_id(self, text: str) -> str:
-        """Generate unique ID for text."""
-        import hashlib
-        return hashlib.md5(text.encode()).hexdigest()[:16]
-```
-
-### 3. Create Composite Stage (`stage.py`)
-
-```python
-from nemo_curator.stages.text.download import DocumentDownloadExtractStage
-from nemo_curator.stages.base import ProcessingStage
-from .url_generation import CustomURLGenerator
-from .download import CustomDownloader
-from .iterator import CustomIterator
-from .extract import CustomExtractor
-
-class CustomDataStage(DocumentDownloadExtractStage):
-    """Custom data loading stage combining all components."""
-
-    def __init__(
-        self,
-        download_dir: str = "./custom_downloads",
-        url_limit: int | None = None,
-        record_limit: int | None = None,
-        add_filename_column: bool | str = True,
-    ):
-        self.url_generator = CustomURLGenerator()
-        self.downloader = CustomDownloader(download_dir=download_dir)
-        self.iterator = CustomIterator()
-        self.extractor = CustomExtractor()
-
-        # Initialize the parent composite stage
-        super().__init__(
-            url_generator=self.url_generator,
-            downloader=self.downloader,
-            iterator=self.iterator,
-            extractor=self.extractor,  # Optional - remove if not needed
-            url_limit=url_limit,
-            record_limit=record_limit,
-            add_filename_column=add_filename_column,
-        )
-        self.name = "custom_data"
-
-    def decompose(self) -> list[ProcessingStage]:
-        """Decompose this composite stage into its constituent stages."""
-        return self.stages
-
-    def get_description(self) -> str:
-        """Get a description of this composite stage."""
-        return "Custom data"
-```
-
----
-
-## Usage Examples
-
-### Basic Pipeline
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from your_data_source.stage import CustomDataStage
-from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter
-
-def main():
-    # Initialize Ray client
-    ray_client = RayClient()
-    ray_client.start()
-
-    # Create pipeline
-    pipeline = Pipeline(
-        name="custom_data_pipeline",
-        description="Load and process custom dataset"
-    )
-
-    # Create custom data loading stage
-    data_stage = CustomDataStage(...)
-
-    pipeline.add_stage(data_stage)
-
-    # Save the results to JSONL
-    pipeline.add_stage(JsonlWriter(...))
-
-    # Run pipeline
-    print("Starting pipeline...")
-    results = pipeline.run()
-
-    # Stop Ray client
-    ray_client.stop()
-
-if __name__ == "__main__":
-    main()
-```
-
-For executor options and configuration, refer to {ref}`reference-execution-backends`.
-
----
-
-## Parameters Reference
-
-```{list-table} Custom Data Loading Parameters
-:header-rows: 1
-:widths: 20 20 40 20
-
-* - Parameter
-  - Type
-  - Description
-  - Default
-* - `url_generator`
-  - URLGenerator
-  - Custom URL generation implementation
-  - Required
-* - `downloader`
-  - DocumentDownloader
-  - Custom download implementation
-  - Required
-* - `iterator`
-  - DocumentIterator
-  - Custom file iteration implementation
-  - Required
-* - `extractor`
-  - DocumentExtractor | None
-  - Optional extraction/transformation step
-  - None
-* - `url_limit`
-  - int | None
-  - Maximum number of URLs to process
-  - None
-* - `record_limit`
-  - int | None
-  - Maximum records per file
-  - None
-* - `add_filename_column`
-  - bool | str
-  - Add filename column to output; if str, uses it as the column name (default name: "file_name")
-  - True
-```
-
----
-
-## Output Format
-
-Processed data flows through the pipeline as `DocumentBatch` tasks containing Pandas DataFrames or PyArrow Tables.
-
-### Example Output Schema
-
-```python
-{
-    "text": "This is the processed document text",
-    "id": "unique-document-id",
-    "source": "example.com",
-    "file_name": "dataset1.jsonl"  # If add_filename_column=True (default column name)
-}
-```
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/load-data/read-existing.md
-```md
----
-description: "Read existing JSONL and Parquet datasets using Curator's reader stages."
-categories: ["how-to-guides"]
-tags: ["jsonl", "parquet", "data-loading", "reader", "pipelines"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "beginner"
-content_type: "how-to"
-modality: "text-only"
----
-
-(text-load-data-read-existing)=
-
-# Read Existing Data
-
-Use Curator's `JsonlReader` and `ParquetReader` to read existing datasets into a pipeline, then optionally add processing stages.
-
-::::{tab-set}
-
-:::{tab-item} JSONL Reader
-:sync: jsonl
-
-## Example: Read JSONL and Filter
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.filters import ScoreFilter
-from nemo_curator.stages.text.filters.heuristic import WordCountFilter
-
-# Initialize Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Create pipeline for processing existing JSONL files
-pipeline = Pipeline(name="jsonl_data_processing")
-
-# Read JSONL files
-reader = JsonlReader(
-    file_paths="/path/to/data",
-    files_per_partition=4,
-    fields=["text", "url"]  # Only read specific columns
-)
-pipeline.add_stage(reader)
-
-# Add filtering stage
-word_filter = ScoreFilter(
-    filter_obj=WordCountFilter(min_words=50, max_words=1000),
-    text_field="text"
-)
-pipeline.add_stage(word_filter)
-
-# Add more stages to pipeline...
-
-# Execute pipeline
-results = pipeline.run()
-
-# Stop Ray client
-ray_client.stop()
-```
-
-:::
-
-:::{tab-item} Parquet Reader
-:sync: parquet
-
-## Example: Read Parquet and Filter
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import ParquetReader
-from nemo_curator.stages.text.filters import ScoreFilter
-from nemo_curator.stages.text.filters.heuristic import WordCountFilter
-
-# Initialize Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Create pipeline for processing existing Parquet files
-pipeline = Pipeline(name="parquet_data_processing")
-
-# Read Parquet files with PyArrow engine
-reader = ParquetReader(
-    file_paths="/path/to/data",
-    files_per_partition=4,
-    fields=["text", "metadata"]  # Only read specific columns
-)
-pipeline.add_stage(reader)
-
-# Add filtering stage
-word_filter = ScoreFilter(
-    filter_obj=WordCountFilter(min_words=50, max_words=1000),
-    text_field="text"
-)
-pipeline.add_stage(word_filter)
-
-# Add more stages to pipeline...
-
-# Execute pipeline
-results = pipeline.run()
-
-# Stop Ray client
-ray_client.stop()
-```
-
-:::
-
-::::
-
-## Reader Configuration
-
-### Common Parameters
-
-Both `JsonlReader` and `ParquetReader` support these configuration options:
-
-```{list-table}
-:header-rows: 1
-:widths: 20 20 40 20
-
-* - Parameter
-  - Type
-  - Description
-  - Default
-* - `file_paths`
-  - str | list[str]
-  - File paths or glob patterns to read
-  - Required
-* - `files_per_partition`
-  - int | None
-  - Number of files per partition. Overrides `blocksize` if both are provided.
-  - None
-* - `blocksize`
-  - int | str | None
-  - Target partition size (e.g., "128MB"). Ignored if `files_per_partition` is provided.
-  - None
-* - `fields`
-  - list[str] | None
-  - Column names to read (column selection)
-  - None (all columns)
-* - `read_kwargs`
-  - dict[str, Any] | None
-  - Extra arguments for the underlying reader
-  - None
-```
-
-### Parquet-Specific Features
-
-`ParquetReader` provides these optimizations:
-
-- **PyArrow Engine**: Uses `pyarrow` engine by default for better performance.
-- **Storage Options**: Supports cloud storage through `storage_options` in `read_kwargs`.
-- **Schema Handling**: Automatic schema inference and validation.
-- **Columnar Efficiency**: Optimized for reading specific columns.
-
-### Performance Tips
-
-- Use the `fields` parameter to read only the required columns for better performance.
-- Set `files_per_partition` based on your cluster size and memory constraints.
-- Use the `blocksize` parameter for fine-grained control over partition sizes.
-
-### Memory Tips
-
-:::{warning}
-If you set the `blocksize` parameter to a size smaller than your input file size(s), Curator does not split the input files and instead attempts to read each file in full. To avoid out-of-memory issues, use the helper script described below.
-:::
-
-If any of your individual JSONL or Parquet files are greater than 2 GiB, we recommend using the `nemo_curator/utils/split_large_files.py` helper script to split them into more manageable sizes and prevent out-of-memory issues. You can run it with:
-
-```bash
-python nemo_curator/utils/split_large_files.py --input-path "/path/to/input/dir" --file-type "parquet" --output-path "/path/to/output/dir" --target-size-mb 128
-```
-
-It supports splitting JSONL or Parquet files as specified by the `--file-type` argument.
-
-Another option is running file splitting within your existing script. For example, you can split large JSONL files with:
-
-```python
-import ray
-from nemo_curator.core.client import RayClient
-from nemo_curator.utils.split_large_files import split_jsonl_file_by_size
-
-# Start Ray client as usual
-ray_client = RayClient()
-ray_client.start()
-
-input_files = []  # your list of input jsonl files
-
-ray.get(
-  [
-    split_jsonl_file_by_size.remote(
-      input_file=f,
-      output_path="/path/to/output/dir",
-      target_size_mb=128,
-    )
-    for f in input_files
-  ]
-)
-
-# initialize your Curator pipeline with JsonlReader, etc.
-```
-
-Similarly for Parquet files:
-
-```python
-import ray
-from nemo_curator.core.client import RayClient
-from nemo_curator.utils.split_large_files import split_parquet_file_by_size
-
-# Start Ray client as usual
-ray_client = RayClient()
-ray_client.start()
-
-input_files = []  # your list of input parquet files
-
-ray.get(
-  [
-    split_parquet_file_by_size.remote(
-      input_file=f,
-      output_path="/path/to/output/dir",
-      target_size_mb=128,
-    )
-    for f in input_files
-  ]
-)
-
-# initialize your Curator pipeline with ParquetReader, etc.
-```
-
-## Output Integration
-
-Both readers produce `DocumentBatch` tasks that integrate seamlessly with:
-
-- **Processing Stages**: Apply filters, transformations, and quality checks.
-- **Writer Stages**: Export to JSONL, Parquet, or other formats.
-- **Analysis Tools**: Convert to Pandas/PyArrow for inspection and debugging.
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/download.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Any
-
-from loguru import logger
-
-from nemo_curator.stages.base import ProcessingStage
-from nemo_curator.stages.resources import Resources
-from nemo_curator.tasks import FileGroupTask
-
-
-class DocumentDownloader(ABC):
-    """Abstract base class for document downloaders."""
-
-    def __init__(self, download_dir: str, verbose: bool = False):
-        """Initialize the downloader.
-
-        Args:
-            download_dir: Directory to store downloaded files
-            verbose: If True, logs detailed download information
-        """
-        self._download_dir = download_dir
-        self._verbose = verbose
-        os.makedirs(download_dir, exist_ok=True)
-
-    @abstractmethod
-    def _get_output_filename(self, url: str) -> str:
-        """Generate output filename from URL.
-
-        Args:
-            url: URL to download
-
-        Returns:
-            Output filename (without directory path)
-        """
-        ...
-
-    @abstractmethod
-    def _download_to_path(self, url: str, path: str) -> tuple[bool, str | None]:
-        """Download URL to specified path.
-
-        Args:
-            url: URL to download
-            path: Local path to save file
-
-        Returns:
-            Tuple of (success, error_message). If success is True, error_message should be None.
-            If success is False, error_message should contain the error details.
-        """
-        ...
-
-    def download(self, url: str) -> str | None:
-        """Download a document from URL with temporary file handling.
-
-        Downloads file to temporary location then atomically moves to final path.
-        Checks for existing file to avoid re-downloading. Supports resumable downloads.
-        Args:
-            url: URL to download
-
-        Returns:
-            Path to downloaded file, or None if download failed
-        """
-        # Generate output filename
-        output_name = self._get_output_filename(url)
-        output_file = os.path.join(self._download_dir, output_name)
-        temp_file = output_file + ".tmp"
-
-        # If final file exists and is non-empty, assume it's complete
-        if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
-            if self._verbose:
-                logger.info(f"File: {output_file} exists. Not downloading")
-            return output_file
-
-        # Download to temporary file
-        success, error_message = self._download_to_path(url, temp_file)
-
-        if success:
-            # Download successful, atomically move temp file to final location
-            os.rename(temp_file, output_file)
-            if self._verbose:
-                file_size = os.path.getsize(output_file)
-                logger.info(f"Successfully downloaded to {output_file} ({file_size} bytes)")
-            return output_file
-        else:
-            # Download failed
-            logger.error(f"Failed to download to {output_file}: {error_message}")
-            return None
-
-    def num_workers_per_node(self) -> int | None:
-        """Number of workers per node for Downloading. This is sometimes needed to ensure we are not overloading the network.
-
-        Returns:
-            Number of workers per node, or None if there is no limit and we can download as fast as possible
-        """
-        return None
-
-
-@dataclass
-class DocumentDownloadStage(ProcessingStage[FileGroupTask, FileGroupTask]):
-    """Stage that downloads files from URLs to local storage.
-
-    Takes a FileGroupTask with URLs and returns a FileGroupTask with local file paths.
-    This allows the download step to scale independently from iteration/extraction.
-    """
-
-    resources = Resources(cpus=0.5)
-    downloader: DocumentDownloader
-    batch_size = None
-
-    def __post_init__(self):
-        self.name = f"download_{self.downloader.__class__.__name__.lower()}"
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        """Define input requirements - expects FileGroupTask with URLs."""
-        return (["data"], [])
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        """Define output - produces FileGroupTask with local paths."""
-        return (["data"], [])
-
-    def process(self, task: FileGroupTask) -> FileGroupTask:
-        """Download URLs to local files.
-
-        Args:
-            task (FileGroupTask): Task containing URLs to download
-
-        Returns:
-            FileGroupTask: Task containing local file paths
-        """
-        local_files = []
-
-        for url in task.data:
-            downloaded_file = self.downloader.download(url)
-            if downloaded_file:
-                local_files.append(downloaded_file)
-
-        return FileGroupTask(
-            task_id=task.task_id,
-            dataset_name=task.dataset_name,
-            data=local_files,
-            _metadata={
-                **task._metadata,
-                "source_files": local_files,  # Add downloaded files for deterministic naming during write stage
-            },
-            _stage_perf=task._stage_perf,
-        )
-
-    def xenna_stage_spec(self) -> dict[str, Any]:
-        return {
-            "num_workers_per_node": self.downloader.num_workers_per_node(),
-        }
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/extract.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class DocumentExtractor(ABC):
-    """Abstract base class for document extractors.
-
-    Takes a record dict and returns processed record dict or None to skip.
-    Can transform any fields in the input dict.
-    """
-
-    @abstractmethod
-    def extract(self, record: dict[str, str]) -> dict[str, Any] | None:
-        """Extract/transform a record dict into final record dict."""
-        ...
-
-    @abstractmethod
-    def input_columns(self) -> list[str]:
-        """Define input columns - produces DocumentBatch with records."""
-        ...
-
-    @abstractmethod
-    def output_columns(self) -> list[str]:
-        """Define output columns - produces DocumentBatch with records."""
-        ...
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/base/stage.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-
-from nemo_curator.stages.base import CompositeStage, ProcessingStage
-from nemo_curator.tasks import DocumentBatch, _EmptyTask
-
-from .download import DocumentDownloader, DocumentDownloadStage
-from .extract import DocumentExtractor
-from .iterator import DocumentIterateExtractStage, DocumentIterator
-from .url_generation import URLGenerationStage, URLGenerator
-
-
-@dataclass
-class DocumentDownloadExtractStage(CompositeStage[_EmptyTask, DocumentBatch]):
-    """Composite stage that combines URL generation, download, and iterate-extract stages.
-
-    This supports the full 3-step pipeline pattern like Common Crawl:
-    1. Generate URLs from minimal input
-    2. Download files from URLs
-    3. Iterate through files to extract structured content
-
-    """
-
-    url_generator: URLGenerator
-    downloader: DocumentDownloader
-    iterator: DocumentIterator
-    extractor: DocumentExtractor | None = None
-    url_limit: int | None = None
-    record_limit: int | None = None
-    add_filename_column: bool | str = True
-    # Restart worker Process every N tasks to mitigate memory fragmentation
-    # Only used if executor is Ray Data
-    extractor_max_calls_per_worker: int | None = None
-
-    def __post_init__(self):
-        """Initialize the constituent stages."""
-        # URL generation stage
-        url_stage = URLGenerationStage(
-            url_generator=self.url_generator,
-            limit=self.url_limit,
-        )
-
-        # Download stage
-        download_stage = DocumentDownloadStage(
-            downloader=self.downloader,
-        )
-
-        # Iterate-extract stage
-        iterate_extract_stage = DocumentIterateExtractStage(
-            iterator=self.iterator,
-            extractor=self.extractor,
-            record_limit=self.record_limit,
-            add_filename_column=self.add_filename_column,
-            max_calls_per_worker=self.extractor_max_calls_per_worker,
-        )
-
-        stages = [url_stage, download_stage, iterate_extract_stage]
-        self.stages = stages
-
-        url_generator_name = self.url_generator.__class__.__name__.lower()
-        downloader_name = self.downloader.__class__.__name__.lower()
-        self.name = f"document_download_extract_{url_generator_name}_{downloader_name}_composite"
-        super().__init__()
-
-    def decompose(self) -> list[ProcessingStage]:
-        """Decompose into constituent stages."""
-        return self.stages
-
-    def get_description(self) -> str:
-        """Get description of this composite stage."""
-        return f"URL-Download-Iterate-Extract pipeline using {self.url_generator.__class__.__name__} and {self.downloader.__class__.__name__}"
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/download/common_crawl/stage.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Literal
-
-from loguru import logger
-
-from nemo_curator.stages.base import ProcessingStage
-from nemo_curator.stages.text.download import DocumentDownloadExtractStage
-from nemo_curator.stages.text.download.html_extractors import HTMLExtractorAlgorithm
-from nemo_curator.stages.text.download.html_extractors.justext import JusTextExtractor
-
-from .download import CommonCrawlWARCDownloader
-from .extract import CommonCrawlHTMLExtractor
-from .url_generation import MainCommonCrawlUrlGenerator, NewsCommonCrawlUrlGenerator
-from .warc_iterator import CommonCrawlWarcIterator
-
-
-class CommonCrawlDownloadExtractStage(DocumentDownloadExtractStage):
-    """Composite stage for downloading and processing Common Crawl data.
-
-    This pipeline:
-    1. Generates WARC URLs (either from main or news crawls)
-    2. Downloads WARC files
-    3. Extracts content from WARC files
-    4. Extracts text from HTML content
-    """
-
-    def __init__(  # noqa: PLR0913
-        self,
-        start_snapshot: str,
-        end_snapshot: str,
-        download_dir: str,
-        crawl_type: Literal["main", "news"] = "main",
-        html_extraction: HTMLExtractorAlgorithm | str | None = None,
-        html_extraction_kwargs: dict | None = None,
-        stop_lists: dict[str, frozenset[str]] | None = None,
-        use_aws_to_download: bool = False,
-        verbose: bool = False,
-        url_limit: int | None = None,
-        record_limit: int | None = None,
-        add_filename_column: bool | str = True,
-        extractor_max_calls_per_worker: int | None = None,
-    ):
-        self.crawl_type = crawl_type
-        self.start_snapshot = start_snapshot
-        self.end_snapshot = end_snapshot
-
-        if crawl_type == "main":
-            self.url_generator = MainCommonCrawlUrlGenerator(
-                start_snapshot_str=start_snapshot, end_snapshot_str=end_snapshot, limit=url_limit
-            )
-        else:
-            self.url_generator = NewsCommonCrawlUrlGenerator(
-                start_snapshot_str=start_snapshot, end_snapshot_str=end_snapshot, limit=url_limit
-            )
-
-        self.downloader = CommonCrawlWARCDownloader(
-            download_dir=download_dir, use_aws_to_download=use_aws_to_download, verbose=verbose
-        )
-        self.iterator = CommonCrawlWarcIterator()
-        self.extractor = CommonCrawlHTMLExtractor(
-            algorithm=html_extraction,
-            algorithm_kwargs=html_extraction_kwargs,
-            stop_lists=stop_lists,
-        )
-        if extractor_max_calls_per_worker is None and isinstance(self.extractor.algorithm, JusTextExtractor):
-            extractor_max_calls_per_worker = 2
-            logger.info(
-                "jusText extraction can cause memory fragmentation and lead to OOM errors. "
-                "Setting extractor_max_calls_per_worker=2 for the iterate-extract stage. "
-                "Pass extractor_max_calls_per_worker explicitly to override."
-            )
-        super().__init__(
-            url_generator=self.url_generator,
-            downloader=self.downloader,
-            iterator=self.iterator,
-            extractor=self.extractor,
-            url_limit=url_limit,
-            record_limit=record_limit,
-            add_filename_column=add_filename_column,
-            extractor_max_calls_per_worker=extractor_max_calls_per_worker,
-        )
-        self.name = f"common_crawl_{self.crawl_type}_pipeline"
-
-    def decompose(self) -> list[ProcessingStage]:
-        """Decompose this composite stage into its constituent stages."""
-        return self.stages
-
-    def get_description(self) -> str:
-        """Get a description of this composite stage."""
-        return f"Common Crawl {self.crawl_type} pipeline: {self.start_snapshot} to {self.end_snapshot}"
-
-```
-
-File: /Users/mromeijn/src/Curator/tutorials/text/download-and-extract/README.md
-```md
-# Download and Extract Common Crawl, Wikipedia, and ArXiv Data
-
-This Jupyter notebook tutorial demonstrates how to use NeMo Curator to download text data from [Common Crawl](https://commoncrawl.org/), [Wikipedia](https://dumps.wikimedia.org/backup-index.html), and [ArXiv](https://info.arxiv.org/help/bulk_data_s3.html), respectively.
-
-For more information about downloading and extracting data with NeMo Curator, refer to the [Download Data](https://docs.nvidia.com/nemo/curator/latest/curate-text/load-data/index.html) and [Data Acquisition Concepts](https://docs.nvidia.com/nemo/curator/latest/about/concepts/text/data-acquisition-concepts.html) documentation pages.
-
-Please note that the ArXiv section of the tutorial requires the [s5cmd](https://github.com/peak/s5cmd) tool to be installed and configured with proper AWS credentials.
-
-```
-
-File: /Users/mromeijn/src/Curator/tutorials/text/llama-nemotron-data-curation/README.md
-```md
-# Curate the Llama Nemotron Reasoning Dataset with NVIDIA NeMo Curator
-
-The [Llama Nemotron Post-Training Dataset](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset) is a curated collection of approximately 30 million high-quality synthetic samples designed to enhance the reasoning capabilities of large language models.
-It is organized into distinct subsets for supervised fine-tuning (SFT) or reinforcement learning (RL) and encompasses samples from various problem domains.
-All samples are in JSON lines (JSONL) format and contain metadata such as license type, source model, as well as the [Llama Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/llama-nemotron/) model(s) trained with that sample.
-
-Each sample consists of a prompt and an expected response. Samples either include detailed chain-of-thought (CoT) reasoning traces followed by a response ("reasoning on"), or contain a direct response without reasoning traces ("reasoning off").
-Here is an example of what a sample from the dataset may look like:
-
-```json
-{
-  "input": [
-    {"role": "user", "content": "Can you explain the Pythagorean theorem?"}
-  ],
-  "output": "<think>The user is asking for an explanation of the Pythagorean theorem. This is a fundamental principle in geometry related to right-angled triangles. I should mention the formula and what each variable represents.</think>The Pythagorean theorem states that in a right triangle, the square of the hypotenuse equals the sum of the squares of the other two sides: a² + b² = c².",
-  "reasoning": "on",
-  "system_prompt": "detailed thinking on",
-  "category": "math",
-  "license": "apache_v2",
-  "generator": "llama-3.3-70b",
-  "used_in_training": ["Ultra"],
-  "version": "v1"
-}
-```
-
-The relevant attributes for this tutorial are as follows:
-
-- `input`: the prompt(s) to the model in the multi-turn chat completions message format. It always contains a message with the role `user`, followed by zero or more turns.
-- `output`: the expected response from the model (ground truth).
-- `reasoning`: whether the sample is for reasoning "on" mode or not
-    - If the value is "on", then the output contains a detailed CoT trace encoded inside think HTML tags followed by the output.
-    - If the value is "off", then the output doesn't contain any reasoning traces and contains a direct response.
-- `system_prompt`: the (suggested) system prompt to control the reasoning mode of the system. For Llama Nemotron training, the system prompt is always either "detailed thinking on" or "detailed thinking off". This field is tied to the value in the `reasoning` field.
-- `used_in_training`: the list of Llama Nemotron models that used this sample for training. For instance, a value of `["Ultra", "Nano"]` indicates that this sample was used for training the Ultra and Nano models, but not Super.
-
-This tutorial demonstrates how a user can process a subset of the Llama Nemotron dataset using NeMo Curator. The output files are created in the `input/output` JSONL format, suitable for use with various training frameworks, including [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo). You can easily modify this pipeline as you see fit and adapt it to your domain- or business-specific needs, and the resulting dataset can be used to train a reasoning model with a modest computing budget.
-
-## Environment Setup
-
-Setup requirements:
-
-- Hardware: This tutorial can be run entirely on CPU workers
-- Recommended environment: This tutorial was developed and tested with a Conda environment
-
-Refer to the NeMo Curator [documentation](https://docs.nvidia.com/nemo/curator/latest/) for instructions on how to download NeMo Curator through PyPI, source, or Docker.
-
-## Prerequisites
-
-### Download Input Dataset
-
-The input dataset can be downloaded from Hugging Face: https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset
-
-The following commands can be used to download the dataset:
-
-```bash
-# If needed: apt-get update && apt-get install -y git-lfs
-git lfs install
-git clone https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset
-```
-
-Alternatively, the dataset can be downloaded using Python:
-
-```python
-from huggingface_hub import snapshot_download
-
-snapshot_download(
-    repo_id="nvidia/Llama-Nemotron-Post-Training-Dataset", 
-    repo_type="dataset", 
-    local_dir="/path/to/save/data",
-    # allow_patterns=["SFT/chat/chat.jsonl", "SFT/math/math_v1.1.jsonl"],  # Select specific files or directories (if desired)
-)
-```
-
-Ensure that the dataset was downloaded correctly. You can verify with the following commands:
-
-```bash
-$ ls /path/to/Llama-Nemotron-Post-Training-Dataset/SFT
-chat  code  math  safety  science
-$ du -sh /path/to/Llama-Nemotron-Post-Training-Dataset/SFT
-122G    /path/to/Llama-Nemotron-Post-Training-Dataset/SFT
-```
-
-The above example ensures that the full SFT dataset was downloaded and is ready to use for the tutorial. If you only selected a subset of the data to download, then you should check that it matches the files on the [Hugging Face page](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset).
-
-### Tokenizer Access Instructions
-
-The tokenizer used by this tutorial is called [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Using it requires requesting access:
-
-1. Visit the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model page on Hugging Face.
-2. Click "Access request".
-3. Fill out the form and wait for approval.
-4. After approval, log in to your Hugging Face account using the Hugging Face CLI. In the terminal, run `huggingface-cli login`.
-
-### Download FastText Language Identification Model
-
-The FastText language identification model is used to identify and filter out non-English text from the dataset. It can be downloaded from the FastText language identification page: https://fasttext.cc/docs/en/language-identification.html
-
-Use the following command to download the FastText language identification model to your current working directory:
-
-```bash
-wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz -P ./
-```
-
-## Usage
-
-This tutorial can be run with:
-
-```bash
-LOGURU_LEVEL="ERROR" python main.py \
-    --input-dir "/path/to/Llama-Nemotron-Post-Training-Dataset/SFT" \
-    --filename-filter "chat" "math_v1.1" \
-    --jsonl-blocksize-mb 100 \
-    --tokenizer "meta-llama/Llama-3.1-8B-Instruct" \
-    --lang-id-model-path "/path/to/lid.176.ftz" \
-    --max-token-count 16384 \
-    --max-completion-token-count 8192 \
-    --keep-columns "input" "output" \
-    --output-dir "/path/to/curated-data" \
-    --num-cpus 16
-```
-
-Setting `LOGURU_LEVEL="ERROR"` minimizes log output. Remove it when debugging. If you encounter issues, see the **Debugging Out of Memory Errors** section for help (reducing `--num-cpus` is the most common fix).
-
-Set `--hf-token` as needed for the tokenizer.
-
-Since the entire input dataset is very large, we recommend curating a focused subset of the data that aligns closely with your domain-specific tasks. To help with this, we provide a way to filter files before reading. There are many ways to subset the Llama Nemotron dataset, but we recommend starting with the math and chat subsets because they contain strong examples of domain-agnostic reasoning. To filter files by name, pass `--filename-filter` followed by any number of strings, such as "chat" and "math_v1.1". When reading the input data directory, the list of files will be filtered to only include files with names containing at least one of the strings provided by `--filename-filter`. If `--filename-filter` is not specified, then all files within the directory (over 30 million rows) will be used.
-
-The above script applies basic filtering to the input dataset:
-
-- Only take samples used for Nemotron Nano training.
-- Remove empty and malformed samples.
-- Remove non-English samples.
-- Remove samples with total length (system prompt, input, and output responses) longer than 16k tokens (with chat template applied using the tokenizer).
-- Remove samples with output responses longer than 8k tokens (with chat template applied using the tokenizer).
-- Only keep columns specified by the `--keep-columns` parameter. We recommend keeping the "input", "output", and "completion_token_count" columns (the "completion_token_count" column always needs to be kept, so that the samples can be sorted).
-
-After filtering, it sorts all samples by completion (output response) length, then interleaves thinking ON and thinking OFF samples for curriculum learning. Samples are sorted in increasing order of difficulty, using the completion token count as a measure of difficulty. By default, records are interleaved one at a time (alternating one thinking ON sample with one thinking OFF sample). Pass `--chunk-size` followed by an integer to interleave in larger groups (for example, 10 or 100 records at a time). Interleaving samples from the "reasoning on" and "reasoning off" buckets gradually introduces complexity.
-
-## System Requirements
-
-- **Memory**: This tutorial can be CPU-only but is memory-intensive. For smaller memory systems, use `--filename-filter` to select a subset of the data.
-- **CPU allocation**: The `--num-cpus` parameter controls parallelism. Each CPU worker processes data in parallel, so more CPUs means more memory usage. Start with a conservative value and increase gradually.
-
-## Debugging Out-of-Memory Errors
-
-If you encounter out-of-memory (OOM) errors:
-
-1. **Reduce partition size**: Lower the blocksize to reduce per-partition memory. Set `--jsonl-blocksize-mb 50` (default is 100 MB).
-2. **Reduce CPU count**: Lower `--num-cpus` to reduce parallel memory pressure rather than using all available cores.
-3. **Subset the data**: Use `--filename-filter` to process only specific subsets relevant to your use case (such as `--filename-filter "chat"`).
-
-## Next Steps
-
-To see how to train a reasoning model with the resulting dataset, refer to this NeMo tutorial: [Train Your Own Reasoning Model in 48 Hours on a Single GPU](https://github.com/NVIDIA/NeMo/tree/main/tutorials/llm/reasoning).
-
-The NeMo tutorial expects the `/path/to/curated-data/training.jsonl` file generated by this tutorial as input.
-
-```
-
-File: /Users/mromeijn/src/Curator/tutorials/text/llama-nemotron-data-curation/main.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-import time
-
-import ray
-from filters.heuristic_filters import (
-    ContainsThinkOpenTagFilter,
-    EmptyThinkTagsFilter,
-    MissingThinkCloseTagFilter,
-    MissingThinkOpenTagFilter,
-    NanoFilter,
-    ThinkingOnFilter,
-    malformed_filter,
-)
-from filters.model_filters import ApplyChatTemplate, CompletionTokenCountFilter, NonEnglishFilter, TokenCountFilter
-from utils.jsonl_utils import interleave_datasets
-
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.filters import ScoreFilter
-from nemo_curator.stages.text.io.reader.jsonl import JsonlReader
-from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter
-from nemo_curator.utils.file_utils import get_all_file_paths_under
-from nemo_curator.utils.split_large_files import split_jsonl_file_by_size
-
-
-def main(args: argparse.Namespace) -> None:  # noqa: PLR0915
-    try:
-        os.makedirs(args.output_dir, exist_ok=False)
-    except FileExistsError as e:
-        msg = f"Output directory already exists: {args.output_dir}. Please delete or rename it and try again."
-        raise FileExistsError(msg) from e
-
-    # Initialize and start Ray client with the number of CPUs specified by the user
-    ray_client = RayClient(num_cpus=args.num_cpus)
-    ray_client.start()
-
-    # Initialize pipelines
-    pipeline_thinking_on = Pipeline(
-        name="curriculum_learning_thinking_on", description="Prepare dataset for curriculum learning with thinking ON."
-    )
-    pipeline_thinking_off = Pipeline(
-        name="curriculum_learning_thinking_off",
-        description="Prepare dataset for curriculum learning with thinking OFF.",
-    )
-
-    start_time = time.time()
-
-    # Handle input path
-    input_files = list(get_all_file_paths_under(args.input_dir, recurse_subdirectories=True, keep_extensions="jsonl"))
-    if args.filename_filter:
-        # Filter out files that don't contain any of the provided substrings
-        input_files = [filename for filename in input_files if any(s in filename for s in args.filename_filter)]
-
-    input_dir = os.path.join(args.output_dir, "input_data_shards")
-    os.makedirs(input_dir, exist_ok=False)
-
-    # Split into smaller files for parallel processing
-    ray.get(
-        [
-            split_jsonl_file_by_size.remote(
-                input_file=f,
-                output_path=input_dir,
-                target_size_mb=args.jsonl_blocksize_mb,
-            )
-            for f in input_files
-        ]
-    )
-
-    # Read files for each pipeline
-    pipeline_thinking_on.add_stage(JsonlReader(file_paths=input_dir))
-    pipeline_thinking_off.add_stage(JsonlReader(file_paths=input_dir))
-
-    # Split pipelines into thinking ON and OFF
-    pipeline_thinking_on.add_stage(ScoreFilter(ThinkingOnFilter(), text_field="reasoning"))
-    pipeline_thinking_off.add_stage(ScoreFilter(ThinkingOnFilter(), text_field="reasoning", invert=True))
-
-    # Filter out samples based on various criteria
-    filter_steps = [
-        ScoreFilter(
-            NanoFilter(),
-            text_field="used_in_training",
-        ),
-        ScoreFilter(
-            EmptyThinkTagsFilter(),
-            text_field="output",
-        ),
-        malformed_filter,
-        ScoreFilter(
-            MissingThinkCloseTagFilter(),
-            text_field="output",
-        ),
-    ]
-    for filter_step in filter_steps:
-        pipeline_thinking_on.add_stage(filter_step)
-        pipeline_thinking_off.add_stage(filter_step)
-
-    # Filter out samples in thinking OFF that contain think tags
-    pipeline_thinking_off.add_stage(
-        ScoreFilter(
-            ContainsThinkOpenTagFilter(),
-            text_field="output",
-        )
-    )
-    # Filter out samples in thinking ON that do not contain think tags
-    pipeline_thinking_on.add_stage(
-        ScoreFilter(
-            MissingThinkOpenTagFilter(),
-            text_field="output",
-        )
-    )
-
-    # Filter out samples based on token count
-    tokenizer_steps = [
-        NonEnglishFilter(
-            tokenizer_identifier=args.tokenizer,
-            hf_token=args.hf_token,
-            lang_id_model_path=args.lang_id_model_path,
-            input_field="input",
-            output_field="output",
-            system_prompt_field="system_prompt",
-        ),
-        TokenCountFilter(
-            tokenizer_identifier=args.tokenizer,
-            hf_token=args.hf_token,
-            max_token_count=args.max_token_count,
-            input_field="input",
-            output_field="output",
-            system_prompt_field="system_prompt",
-        ),
-        CompletionTokenCountFilter(
-            tokenizer_identifier=args.tokenizer,
-            hf_token=args.hf_token,
-            max_completion_token_count=args.max_completion_token_count,
-            output_field="output",
-        ),
-        ApplyChatTemplate(
-            tokenizer_identifier=args.tokenizer,
-            hf_token=args.hf_token,
-            input_field="input",
-            output_field="output",
-            system_prompt_field="system_prompt",
-        ),
-    ]
-    for tokenizer_step in tokenizer_steps:
-        pipeline_thinking_on.add_stage(tokenizer_step)
-        pipeline_thinking_off.add_stage(tokenizer_step)
-
-    if args.keep_columns:
-        keep_columns = args.keep_columns
-        # Always keep the completion_token_count column, so that we can sort the samples
-        if "completion_token_count" not in keep_columns:
-            keep_columns.append("completion_token_count")
-    else:
-        keep_columns = ["input", "output", "completion_token_count"]
-
-    # Save intermediate datasets
-    thinking_on_unsorted_path = os.path.join(args.output_dir, "thinking_on_unsorted")
-    thinking_off_unsorted_path = os.path.join(args.output_dir, "thinking_off_unsorted")
-    pipeline_thinking_on.add_stage(JsonlWriter(thinking_on_unsorted_path, fields=keep_columns))
-    pipeline_thinking_off.add_stage(JsonlWriter(thinking_off_unsorted_path, fields=keep_columns))
-
-    # Run pipelines
-    _thinking_on_output = pipeline_thinking_on.run()
-    _thinking_off_output = pipeline_thinking_off.run()
-
-    # Sort datasets
-    thinking_on_ds = ray.data.read_json(thinking_on_unsorted_path, lines=True)
-    thinking_on_ds = thinking_on_ds.sort("completion_token_count")
-    thinking_on_sorted_path = os.path.join(args.output_dir, "thinking_on_sorted")
-    thinking_on_ds.write_json(thinking_on_sorted_path, orient="records", lines=True)
-
-    thinking_off_ds = ray.data.read_json(thinking_off_unsorted_path, lines=True)
-    thinking_off_ds = thinking_off_ds.sort("completion_token_count")
-    thinking_off_sorted_path = os.path.join(args.output_dir, "thinking_off_sorted")
-    thinking_off_ds.write_json(thinking_off_sorted_path, orient="records", lines=True)
-
-    # Interleave datasets and combine into a single output file
-    interleave_datasets(
-        thinking_on_sorted_path,
-        thinking_off_sorted_path,
-        os.path.join(args.output_dir, "training.jsonl"),
-        chunk_size=args.chunk_size,
-    )
-
-    end_time = time.time()
-    print(f"Total time taken: {end_time - start_time} seconds")
-
-    ray_client.stop()
-
-
-def attach_args() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        "Prepare dataset for curriculum learning.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-
-    parser.add_argument(
-        "--num-cpus",
-        type=int,
-        default=16,
-        help="Number of CPUs to use.",
-    )
-
-    parser.add_argument(
-        "--input-dir",
-        type=str,
-        help="Path to the input directory containing JSONL files.",
-        required=True,
-    )
-    parser.add_argument(
-        "--filename-filter",
-        nargs="+",
-        type=str,
-        help="If specified, only files with names containing one or more of the provided substrings will be processed.",
-    )
-    parser.add_argument(
-        "--jsonl-blocksize-mb",
-        type=int,
-        default=100,
-        help="Blocksize (in MB) to use for splitting the JSONL files.",
-    )
-
-    parser.add_argument(
-        "--tokenizer",
-        type=str,
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        help="Hugging Face tokenizer",
-    )
-    parser.add_argument(
-        "--hf-token",
-        type=str,
-        help="Hugging Face token (if needed)",
-    )
-    parser.add_argument(
-        "--lang-id-model-path",
-        type=str,
-        help="Path to the FastText model",
-        required=True,
-    )
-    parser.add_argument(
-        "--max-token-count",
-        type=int,
-        default=16384,
-        help="Optional maximum token count. Rows exceeding this count will be filtered out.",
-    )
-    parser.add_argument(
-        "--max-completion-token-count",
-        type=int,
-        default=8192,
-        help="Optional maximum completion token count. Rows exceeding this count will be filtered out.",
-    )
-
-    parser.add_argument(
-        "--keep-columns",
-        nargs="+",
-        type=str,
-        help="Columns to keep when the dataset is written to disk.",
-    )
-
-    parser.add_argument(
-        "--chunk-size",
-        type=int,
-        default=1,
-        help="Chunk size to use for interleaving the datasets.",
-    )
-
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        help="Path to the output directory.",
-        required=True,
-    )
-
-    return parser
-
-
-if __name__ == "__main__":
-    main(attach_args().parse_args())
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/curator-processing-language-quality.txt b/skills/nemotron-customize/context/curator-processing-language-quality.txt
deleted file mode 100644
index 87c033d87..000000000
--- a/skills/nemotron-customize/context/curator-processing-language-quality.txt
+++ /dev/null
@@ -1,3302 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Curator
-├── docs
-│   ├── about
-│   │   ├── concepts
-│   │   │   ├── text
-│   │   │   │   ├── _images
-│   │   │   │   └── data-processing-concepts.md *
-│   │   │   ├── audio
-│   │   │   ├── image
-│   │   │   └── video
-│   │   │       └── _images
-│   │   └── release-notes
-│   ├── curate-text
-│   │   ├── process-data
-│   │   │   ├── language-management
-│   │   │   │   └── language.md *
-│   │   │   ├── quality-assessment
-│   │   │   │   ├── classifier.md *
-│   │   │   │   └── distributed-classifier.md *
-│   │   │   ├── content-processing
-│   │   │   ├── deduplication
-│   │   │   ├── specialized-processing
-│   │   │   └── index.md *
-│   │   ├── load-data
-│   │   ├── synthetic
-│   │   │   └── nemotron-cc
-│   │   └── tutorials
-│   ├── _extensions
-│   │   ├── ai_assistant
-│   │   │   ├── assets
-│   │   │   │   └── styles
-│   │   │   ├── core
-│   │   │   ├── integrations
-│   │   │   └── ui
-│   │   ├── content_gating
-│   │   ├── json_output
-│   │   │   ├── content
-│   │   │   ├── core
-│   │   │   └── processing
-│   │   ├── rich_metadata
-│   │   │   └── templates
-│   │   └── search_assets
-│   │       ├── modules
-│   │       └── templates
-│   ├── _images
-│   ├── _templates
-│   ├── admin
-│   │   ├── deployment
-│   │   │   └── slurm
-│   │   └── integrations
-│   ├── curate-audio
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   │   ├── asr-inference
-│   │   │   ├── audio-analysis
-│   │   │   ├── quality-assessment
-│   │   │   └── text-integration
-│   │   └── tutorials
-│   ├── curate-images
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   │   ├── embeddings
-│   │   │   └── filters
-│   │   └── tutorials
-│   ├── curate-video
-│   │   ├── load-data
-│   │   ├── process-data
-│   │   └── tutorials
-│   │       ├── _images
-│   │       └── pipeline-customization
-│   ├── get-started
-│   └── reference
-│       └── infrastructure
-├── nemo_curator
-│   ├── config
-│   │   └── text
-│   │       └── heuristic_filter_english_pipeline.yaml *
-│   ├── stages
-│   │   ├── text
-│   │   │   ├── classifiers
-│   │   │   │   ├── base.py * +
-│   │   │   │   ├── domain.py * +
-│   │   │   │   └── quality.py * +
-│   │   │   ├── filters
-│   │   │   │   ├── fasttext
-│   │   │   │   ├── heuristic
-│   │   │   │   │   ├── code
-│   │   │   │   │   └── repetition
-│   │   │   │   ├── histogram
-│   │   │   │   ├── token
-│   │   │   │   ├── doc_filter.py * +
-│   │   │   │   └── score_filter.py * +
-│   │   │   ├── deduplication
-│   │   │   ├── download
-│   │   │   │   ├── arxiv
-│   │   │   │   ├── base
-│   │   │   │   ├── common_crawl
-│   │   │   │   ├── html_extractors
-│   │   │   │   │   └── utils
-│   │   │   │   └── wikipedia
-│   │   │   ├── embedders
-│   │   │   ├── io
-│   │   │   │   ├── reader
-│   │   │   │   └── writer
-│   │   │   ├── models
-│   │   │   ├── modifiers
-│   │   │   │   ├── fasttext
-│   │   │   │   ├── string
-│   │   │   │   └── unicode
-│   │   │   ├── modules
-│   │   │   └── utils
-│   │   ├── audio
-│   │   │   ├── advanced_pipelines
-│   │   │   │   └── audio_data_filter
-│   │   │   ├── alm
-│   │   │   ├── datasets
-│   │   │   │   ├── fleurs
-│   │   │   │   └── readspeech
-│   │   │   ├── filtering
-│   │   │   │   ├── band_filter_module
-│   │   │   │   └── sigmos_filter_module
-│   │   │   │       └── third_party
-│   │   │   │           └── sigmos
-│   │   │   ├── inference
-│   │   │   ├── io
-│   │   │   ├── metrics
-│   │   │   ├── postprocessing
-│   │   │   ├── preprocessing
-│   │   │   └── segmentation
-│   │   │       └── speaker_separation_module
-│   │   ├── deduplication
-│   │   │   ├── exact
-│   │   │   ├── fuzzy
-│   │   │   │   └── lsh
-│   │   │   ├── semantic
-│   │   │   └── shuffle_utils
-│   │   ├── image
-│   │   │   ├── deduplication
-│   │   │   ├── embedders
-│   │   │   ├── filters
-│   │   │   └── io
-│   │   ├── interleaved
-│   │   │   ├── filter
-│   │   │   ├── io
-│   │   │   │   ├── readers
-│   │   │   │   └── writers
-│   │   │   ├── pdf
-│   │   │   │   └── nemotron_parse
-│   │   │   └── utils
-│   │   ├── math
-│   │   │   ├── classifiers
-│   │   │   ├── download
-│   │   │   │   └── html_extractors
-│   │   │   └── modifiers
-│   │   ├── synthetic
-│   │   │   ├── nemo_data_designer
-│   │   │   └── nemotron_cc
-│   │   │       └── nemo_data_designer
-│   │   └── video
-│   │       ├── caption
-│   │       ├── clipping
-│   │       ├── embedding
-│   │       ├── filtering
-│   │       ├── io
-│   │       └── preview
-│   ├── backends
-│   │   ├── internal
-│   │   │   └── raft
-│   │   ├── ray_actor_pool
-│   │   ├── ray_data
-│   │   └── xenna
-│   ├── core
-│   ├── metrics
-│   ├── models
-│   │   └── client
-│   ├── pipeline
-│   ├── tasks
-│   └── utils
-├── tutorials
-│   ├── text
-│   │   ├── distributed-data-classification
-│   │   │   └── README.md *
-│   │   ├── deduplication
-│   │   │   ├── fuzzy
-│   │   │   └── semantic
-│   │   ├── download-and-extract
-│   │   ├── gliner-pii-redaction
-│   │   ├── llama-nemotron-data-curation
-│   │   │   ├── filters
-│   │   │   └── utils
-│   │   ├── megatron-tokenizer
-│   │   ├── peft-curation
-│   │   └── tinystories
-│   ├── audio
-│   │   ├── alm
-│   │   ├── callhome_diar
-│   │   ├── fleurs
-│   │   ├── readspeech
-│   │   └── single_speaker_filter
-│   ├── image
-│   │   └── getting-started
-│   ├── interleaved
-│   │   └── nemotron_parse_pdf
-│   ├── math
-│   ├── multimodal
-│   ├── slurm
-│   ├── synthetic
-│   │   ├── nemo_data_designer
-│   │   └── nemotron_cc
-│   │       ├── example_data
-│   │       └── nemo_data_designer
-│   └── video
-│       └── getting-started
-├── .cursor
-│   └── rules
-├── .github
-│   ├── actions
-│   │   ├── build-container
-│   │   └── test-template
-│   ├── scripts
-│   └── workflows
-│       └── config
-├── benchmarking
-│   ├── data_prep
-│   ├── runner
-│   │   └── sinks
-│   ├── scripts
-│   └── tools
-├── docker
-│   └── common
-├── fern
-│   ├── assets
-│   │   └── images
-│   ├── components
-│   └── versions
-│       ├── v25.09
-│       │   └── pages
-│       │       ├── about
-│       │       │   ├── concepts
-│       │       │   │   ├── audio
-│       │       │   │   ├── image
-│       │       │   │   ├── text
-│       │       │   │   └── video
-│       │       │   └── release-notes
-│       │       ├── admin
-│       │       │   ├── deployment
-│       │       │   └── integrations
-│       │       ├── api-reference
-│       │       │   ├── executors
-│       │       │   └── tasks
-│       │       ├── curate-audio
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── asr-inference
-│       │       │   │   ├── audio-analysis
-│       │       │   │   ├── quality-assessment
-│       │       │   │   └── text-integration
-│       │       │   └── tutorials
-│       │       ├── curate-images
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── embeddings
-│       │       │   │   └── filters
-│       │       │   └── tutorials
-│       │       ├── curate-text
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   │   ├── content-processing
-│       │       │   │   ├── deduplication
-│       │       │   │   ├── language-management
-│       │       │   │   ├── quality-assessment
-│       │       │   │   └── specialized-processing
-│       │       │   └── tutorials
-│       │       ├── curate-video
-│       │       │   ├── load-data
-│       │       │   ├── process-data
-│       │       │   └── tutorials
-│       │       │       └── pipeline-customization
-│       │       ├── get-started
-│       │       └── reference
-│       │           └── infrastructure
-│       └── v26.02
-│           └── pages
-│               ├── _images
-│               ├── about
-│               │   ├── concepts
-│               │   │   ├── audio
-│               │   │   ├── image
-│               │   │   ├── text
-│               │   │   │   └── _images
-│               │   │   └── video
-│               │   │       └── _images
-│               │   └── release-notes
-│               ├── admin
-│               │   ├── deployment
-│               │   │   └── slurm
-│               │   └── integrations
-│               ├── api-reference
-│               │   ├── executors
-│               │   └── tasks
-│               ├── curate-audio
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── asr-inference
-│               │   │   ├── audio-analysis
-│               │   │   ├── quality-assessment
-│               │   │   └── text-integration
-│               │   └── tutorials
-│               ├── curate-images
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── embeddings
-│               │   │   └── filters
-│               │   └── tutorials
-│               ├── curate-text
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   │   ├── content-processing
-│               │   │   ├── deduplication
-│               │   │   ├── language-management
-│               │   │   ├── quality-assessment
-│               │   │   └── specialized-processing
-│               │   ├── synthetic
-│               │   │   └── nemotron-cc
-│               │   └── tutorials
-│               ├── curate-video
-│               │   ├── load-data
-│               │   ├── process-data
-│               │   └── tutorials
-│               │       ├── _images
-│               │       └── pipeline-customization
-│               ├── get-started
-│               └── reference
-│                   └── infrastructure
-└── tests
-    ├── backends
-    │   ├── ray_actor_pool
-    │   └── ray_data
-    ├── config
-    ├── core
-    ├── fixtures
-    │   └── audio
-    │       └── alm
-    │           └── nested_manifests
-    │               ├── subdir_a
-    │               └── subdir_b
-    ├── metrics
-    ├── models
-    │   └── client
-    ├── pipelines
-    ├── stages
-    │   ├── audio
-    │   │   ├── advanced_pipelines
-    │   │   ├── alm
-    │   │   ├── datasets
-    │   │   ├── filtering
-    │   │   ├── inference
-    │   │   ├── io
-    │   │   ├── metrics
-    │   │   ├── postprocessing
-    │   │   ├── preprocessing
-    │   │   └── segmentation
-    │   ├── common
-    │   ├── deduplication
-    │   │   ├── exact
-    │   │   ├── fuzzy
-    │   │   ├── semantic
-    │   │   └── shuffle_utils
-    │   ├── image
-    │   │   ├── dedup
-    │   │   ├── embedders
-    │   │   ├── filters
-    │   │   └── io
-    │   ├── interleaved
-    │   │   ├── filter
-    │   │   ├── pdf
-    │   │   │   └── nemotron_parse
-    │   │   └── utils
-    │   ├── math_stages
-    │   │   ├── classifiers
-    │   │   ├── download
-    │   │   └── modifiers
-    │   ├── synthetic
-    │   │   ├── nemo_data_designer
-    │   │   └── nemotron_cc
-    │   │       └── nemo_data_designer
-    │   ├── text
-    │   │   ├── classifiers
-    │   │   ├── deduplication
-    │   │   ├── download
-    │   │   │   ├── arxiv
-    │   │   │   ├── base
-    │   │   │   ├── common_crawl
-    │   │   │   └── wikipedia
-    │   │   ├── embedders
-    │   │   ├── io
-    │   │   │   ├── reader
-    │   │   │   └── writer
-    │   │   ├── models
-    │   │   └── modules
-    │   └── video
-    │       ├── caption
-    │       │   └── fixtures
-    │       ├── clipping
-    │       ├── embedding
-    │       ├── filtering
-    │       ├── io
-    │       └── preview
-    ├── tasks
-    └── utils
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; selected files shown.
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/base.py
-Imports:
-  - import contextlib
-  - import copy
-  - import time
-  - from abc import ABC, ABCMeta, abstractmethod
-  - from inspect import isabstract
-  - from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, final
-  - from loguru import logger
-  - from nemo_curator.stages.resources import Resources
-  - from nemo_curator.tasks import Task
-  - from nemo_curator.backends.base import NodeInfo, WorkerMetadata
----
-Classes:
-  - StageMeta
-    Methods:
-      - L46: def __new__(mcls, name, bases, namespace, **kwargs):
-  - ProcessingStage
-    Methods:
-      - L92: def _name(self) -> str:
-      - L97: def _resources(self) -> Resources:
-      - L102: def _batch_size(self) -> int | None:
-      - L106: def __init_subclass__(cls, **kwargs):
-      - L127: def num_workers(self) -> int | None:
-      - L131: def validate_input(self, task: Task) -> bool:
-      - L161: def process(self, task: X) -> Y | list[Y]:
-      - L171: def process_batch(self, tasks: list[X]) -> list[Y]:
-      - L201: def setup_on_node(self, node_info: NodeInfo | None = None, worker_metadata: WorkerMetadata | None = None) -> None:
-      - L209: def setup(self, worker_metadata: WorkerMetadata | None = None) -> None:
-      - L217: def teardown(self) -> None:
-      - L222: def supports_batch_processing(self) -> bool:
-      - L230: def __repr__(self) -> str:
-      - L234: def inputs(self) -> tuple[list[str], list[str]]:
-      - L244: def outputs(self) -> tuple[list[str], list[str]]:
-      - L254: def xenna_stage_spec(self) -> dict[str, Any]:
-      - L262: def with_(
-        self,
-        name: str | None = None,
-        resources: Resources | None = None,
-        batch_size: int | None = None,
-        runtime_env: dict[str, Any] | None = None,
-    ) -> ProcessingStage:
-      - L293: def get_config(self) -> dict[str, Any]:
-      - L305: def ray_stage_spec(self) -> dict[str, Any]:
-      - L316: def _log_metrics(self, metrics: dict[str, float]) -> None:
-      - L327: def _log_metric(self, name: str, value: float) -> None:
-      - L331: def _time_metric(self, name: str) -> contextlib.AbstractContextManager[None]:
-      - L339: def _consume_custom_metrics(self) -> dict[str, float]:
-    Properties:
-      - _is_abstract_root
-      - name
-      - resources
-      - batch_size
-      - runtime_env
-  - CompositeStage
-    Methods:
-      - L359: def __init__(self):
-      - L362: def inputs(self) -> tuple[list[str], list[str]]:
-      - L366: def outputs(self) -> tuple[list[str], list[str]]:
-      - L371: def decompose(self) -> list[ProcessingStage]:
-      - L381: def with_(self, stage_with_dict: dict[str, Any]) -> CompositeStage:
-      - L387: def decompose_and_apply_with(self) -> list[ProcessingStage]:
-      - L391: def _apply_with_(self, stages: list[ProcessingStage]) -> list[ProcessingStage]:
-      - L419: def process(self, task: X) -> Y | list[Y]:
-      - L425: def get_description(self) -> str:
-
-Functions:
-  - L62: def get_stage_class(name: str) -> type[ProcessingStage]:
-
-Global vars:
-  - X
-  - Y
-  - _STAGE_REGISTRY
----
-
-
-File: /Users/mromeijn/src/Curator/nemo_curator/tasks/document.py
-Imports:
-  - from dataclasses import dataclass, field
-  - import pandas as pd
-  - import pyarrow as pa
-  - from loguru import logger
-  - from .tasks import Task
----
-Classes:
-  - DocumentBatch
-    Methods:
-      - L34: def to_pyarrow(self) -> pa.Table:
-      - L44: def to_pandas(self) -> pd.DataFrame:
-      - L55: def num_items(self) -> int:
-      - L59: def get_columns(self) -> list[str]:
-      - L69: def validate(self) -> bool:
-    Properties:
-      - data
----
-
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Curator/docs/about/concepts/text/data-processing-concepts.md
-```md
----
-description: "Text processing workflows including quality filtering, fuzzy deduplication, content cleaning, and pipeline design"
-categories: ["concepts-architecture"]
-tags: ["data-processing", "quality-filtering", "deduplication", "pipeline", "distributed"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "concept"
-modality: "text-only"
----
-
-(about-concepts-text-data-processing)=
-# Text Processing Concepts
-
-This guide covers the most common text processing workflows in NVIDIA NeMo Curator, based on real-world usage patterns from production data curation pipelines.
-
-## Most Common Workflows
-
-The majority of NeMo Curator users follow these core workflows, typically in this order:
-
-### 1. Quality Filtering
-
-Most users start with basic quality filtering using heuristic filters to remove low-quality content:
-
-**Essential Quality Filters:**
-
-- `WordCountFilter` - Remove too short/long documents
-- `NonAlphaNumericFilter` - Remove symbol-heavy content
-- `RepeatedLinesFilter` - Remove if content is too repetitive
-- `PunctuationFilter` - Ensure proper sentence structure
-- `BoilerPlateStringFilter` - Remove if content contains too much template/boilerplate text
-
-### 2. Content Cleaning and Modification
-
-Basic text normalization and cleaning operations:
-
-**Common Cleaning Steps:**
-
-- `UnicodeReformatter` - Normalize Unicode characters
-- `NewlineNormalizer` - Standardize line breaks
-- Basic HTML/markup removal
-
-### 3. Deduplication
-
-Remove duplicate and near-duplicate content. For comprehensive coverage of all deduplication approaches, refer to Curator's [Deduplication Concepts](about-concepts-deduplication).
-
-#### Exact Deduplication
-
-Remove identical documents, especially useful for smaller datasets:
-
-**Implementation:** MD5 or SHA-256 hashing for document identification
-
-#### Fuzzy Deduplication
-
-For production datasets, fuzzy deduplication is essential to remove near-duplicate content across sources:
-
-**Key Components:**
-
-- Ray distributed computing framework for scalability
-- Connected components clustering for duplicate identification
-
-#### Semantic Deduplication
-
-Remove semantically similar content using embeddings for more sophisticated duplicate detection.
-
-## Core Processing Architecture
-
-NeMo Curator uses these fundamental building blocks that users combine into pipelines:
-
-```{list-table}
-:header-rows: 1
-
-* - Component
-  - Purpose
-  - Usage Pattern
-* - **`Pipeline`**
-  - Orchestrate processing stages
-  - Add processing stages, typically starting with a read and completing with a write
-* - **`ScoreFilter`**
-  - Apply filters with optional scoring
-  - Chain multiple quality filters
-* - **`Modify`**
-  - Transform document content
-  - Clean and normalize text
-* - **Reader/Writer Stages**
-  - Load and save text data
-  - Input/output for pipelines
-* - **Processing Stages**
-  - Transform DocumentBatch tasks
-  - Core processing components
-```
-
-## Implementation Examples
-
-### Complete Quality Filtering Pipeline
-
-This is the most common starting workflow, used in 90% of production pipelines:
-
-:::{dropdown} Quality Filtering Pipeline Code Example
-:icon: code-square
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.filters import ScoreFilter
-from nemo_curator.stages.text.filters.heuristic.repetition import RepeatedLinesFilter
-from nemo_curator.stages.text.filters.heuristic import (
-    WordCountFilter,
-    NonAlphaNumericFilter,
-    PunctuationFilter,
-    BoilerPlateStringFilter
-)
-
-# Start Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Create processing pipeline
-pipeline = Pipeline(name="quality_filtering")
-
-# Load dataset - the starting point for all workflows
-reader = JsonlReader(file_paths="input_data/")
-pipeline.add_stage(reader)
-
-# Standard quality filtering pipeline (most common)
-# Remove too short/long documents (essential)
-# and save the word_count field
-word_count_filter = ScoreFilter(
-    filter_obj=WordCountFilter(min_words=50, max_words=100000),
-    text_field="text",
-    score_field="word_count"
-)
-pipeline.add_stage(word_count_filter)
-
-# Remove symbol-heavy content
-alpha_numeric_filter = ScoreFilter(
-    filter_obj=NonAlphaNumericFilter(max_non_alpha_numeric_to_text_ratio=0.25),
-    text_field="text"
-)
-pipeline.add_stage(alpha_numeric_filter)
-
-# Remove repetitive content
-repeated_lines_filter = ScoreFilter(
-    filter_obj=RepeatedLinesFilter(max_repeated_line_fraction=0.7),
-    text_field="text"
-)
-pipeline.add_stage(repeated_lines_filter)
-
-# Ensure proper sentence structure
-punctuation_filter = ScoreFilter(
-    filter_obj=PunctuationFilter(max_num_sentences_without_endmark_ratio=0.85),
-    text_field="text"
-)
-pipeline.add_stage(punctuation_filter)
-
-# Remove template/boilerplate text
-boilerplate_filter = ScoreFilter(
-    filter_obj=BoilerPlateStringFilter(),
-    text_field="text"
-)
-pipeline.add_stage(boilerplate_filter)
-
-# Add writer stage
-writer = JsonlWriter(path="filtered_data/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()
-
-# Cleanup Ray when done
-ray_client.stop()
-```
-
-:::
-
-### Content Cleaning Pipeline
-
-Basic text normalization:
-
-:::{dropdown} Content Cleaning Pipeline Code Example
-:icon: code-square
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.modifiers import Modify
-from nemo_curator.stages.text.modifiers.unicode import UnicodeReformatter
-
-# Start Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Create cleaning pipeline
-pipeline = Pipeline(name="content_cleaning")
-
-# Read input data
-reader = JsonlReader(file_paths="input_data/")
-pipeline.add_stage(reader)
-
-# Essential cleaning steps
-# Normalize unicode characters (very common)
-unicode_modifier = Modify(
-    modifier_fn=UnicodeReformatter(),
-    input_fields="text"
-)
-pipeline.add_stage(unicode_modifier)
-
-# Additional processing steps can be added as needed
-
-# Write cleaned data
-writer = JsonlWriter(path="cleaned_data/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()
-
-# Cleanup Ray when done
-ray_client.stop()
-```
-
-:::
-
-### Exact Deduplication Workflow
-
-Exact deduplication for any dataset size (requires Ray and at least 1 GPU):
-
-:::{dropdown} Exact Deduplication Code Example
-:icon: code-square
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.stages.deduplication.exact.workflow import ExactDeduplicationWorkflow
-
-# Initialize Ray cluster with GPU support (required for exact deduplication)
-ray_client = RayClient(num_gpus=4)
-ray_client.start()
-
-# Configure exact deduplication workflow
-exact_workflow = ExactDeduplicationWorkflow(
-    input_path="/path/to/input/data",
-    output_path="/path/to/output",
-    text_field="text",
-    perform_removal=False,  # Currently only identification supported
-    assign_id=True,         # Automatically assign unique IDs
-    input_filetype="parquet",
-)
-
-# Run exact deduplication workflow
-exact_workflow.run()
-
-# Cleanup Ray when done
-ray_client.stop()
-```
-
-:::
-
-### Fuzzy Deduplication Workflow
-
-Critical for production datasets (requires Ray and at least 1 GPU):
-
-:::{dropdown} Fuzzy Deduplication Code Example
-:icon: code-square
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.stages.deduplication.fuzzy.workflow import FuzzyDeduplicationWorkflow
-
-# Initialize Ray cluster with GPU support (required for fuzzy deduplication)
-ray_client = RayClient(num_gpus=4)
-ray_client.start()
-
-# Configure fuzzy deduplication workflow (production settings)
-fuzzy_workflow = FuzzyDeduplicationWorkflow(
-    input_path="/path/to/input/data",
-    cache_path="/path/to/cache",
-    output_path="/path/to/output",
-    input_filetype="parquet",
-    input_blocksize="1.5GiB",
-    text_field="text",
-    perform_removal=False,  # Currently only identification supported
-    # LSH parameters for ~80% similarity threshold
-    num_bands=20,           # Number of LSH bands
-    minhashes_per_band=13,  # Hashes per band
-    char_ngrams=24,         # Character n-gram size
-    seed=42
-)
-
-# Run fuzzy deduplication workflow
-fuzzy_workflow.run()
-
-# Cleanup Ray when done
-ray_client.stop()
-```
-
-### Removing Identified Duplicates
-
-The identified duplicates can be removed using a separate workflow:
-
-:::{dropdown} Duplicate Removal Code Example
-:icon: code-square
-
-```python
-from nemo_curator.core.client import RayClient
-from nemo_curator.stages.text.deduplication.removal_workflow import TextDuplicatesRemovalWorkflow
-
-# Start Ray client
-ray_client = RayClient()
-ray_client.start()
-
-# Configure workflow with input dataset and output duplicate IDs
-removal_workflow = TextDuplicatesRemovalWorkflow(
-    input_path="/path/to/input/data",
-    ids_to_remove_path="/path/to/output/FuzzyDuplicateIds",
-    output_path="/path/to/deduplicated/output",
-    input_filetype="parquet",  # Same as identification workflow
-    input_blocksize="1.5GiB",  # Same as identification workflow
-    duplicate_id_field="_curator_dedup_id",
-    id_generator_path="/path/to/output/fuzzy_id_generator.json",
-)
-
-# Run removal workflow
-removal_workflow.run()
-
-# Cleanup Ray when done
-ray_client.stop()
-```
-
-:::
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/index.md
-```md
----
-description: "Process text data using comprehensive filtering, deduplication, content processing, and specialized tools for high-quality datasets"
-categories: ["workflows"]
-tags: ["data-processing", "filtering", "deduplication", "content-processing", "quality-assessment", "distributed"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "workflow"
-modality: "text-only"
----
-
-# Process Data for Text Curation
-
-Process text data you've loaded through NeMo Curator's {ref}`pipeline architecture <about-concepts-text-data-loading>`.
-
-NeMo Curator provides a comprehensive suite of tools for processing text data as part of the AI training pipeline. These tools help you analyze, transform, and filter your text datasets to ensure high-quality input for language model training.
-
-## How it Works
-
-NeMo Curator's text processing capabilities are organized into five main categories:
-
-1. **Language Management**: Handle multilingual content and language-specific processing
-2. **Content Processing & Cleaning**: Clean, normalize, and transform text content
-3. **Deduplication**: Remove duplicate and near-duplicate documents efficiently
-4. **Quality Assessment & Filtering**: Score and remove low-quality content using heuristics and ML classifiers
-5. **Specialized Processing**: Domain-specific processing for code and advanced curation tasks
-
-Each category provides specific implementations optimized for different curation needs. The result is a cleaned and filtered dataset ready for model training.
-
----
-
-## Language Management
-
-Handle multilingual content and language-specific processing requirements.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`globe;1.5em;sd-mr-1` Language Identification
-:link: language-management/language
-:link-type: doc
-Identify document languages and separate multilingual datasets
-+++
-{bdg-secondary}`fasttext`
-{bdg-secondary}`176-languages`
-{bdg-secondary}`detection`
-:::
-
-:::{grid-item-card} {octicon}`filter;1.5em;sd-mr-1` Stop Words
-:link: language-management/stopwords
-:link-type: doc
-Manage high-frequency words to enhance text extraction and content detection
-+++
-{bdg-secondary}`preprocessing`
-{bdg-secondary}`filtering`
-{bdg-secondary}`language-specific`
-:::
-
-::::
-
-## Content Processing & Cleaning
-
-Clean, normalize, and transform text content for high-quality training data.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`typography;1.5em;sd-mr-1` Text Cleaning
-:link: content-processing/text-cleaning
-:link-type: doc
-Fix Unicode issues, standardize spacing, and remove URLs
-+++
-{bdg-secondary}`unicode`
-{bdg-secondary}`normalization`
-{bdg-secondary}`preprocessing`
-:::
-
-::::
-
-## Deduplication
-
-Remove duplicate and near-duplicate documents efficiently from your text datasets. All deduplication methods support both identification (finding duplicates) and removal (filtering them out) workflows.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`git-pull-request;1.5em;sd-mr-1` Exact Duplicate Removal
-:link: deduplication/exact
-:link-type: doc
-Identify and remove character-for-character duplicates using MD5 hashing
-+++
-{bdg-secondary}`hashing`
-{bdg-secondary}`fast`
-{bdg-secondary}`gpu-accelerated`
-:::
-
-:::{grid-item-card} {octicon}`git-compare;1.5em;sd-mr-1` Fuzzy Duplicate Removal
-:link: deduplication/fuzzy
-:link-type: doc
-Identify and remove near-duplicates using MinHash and LSH similarity
-+++
-{bdg-secondary}`minhash`
-{bdg-secondary}`lsh`
-{bdg-secondary}`gpu-accelerated`
-:::
-
-:::{grid-item-card} {octicon}`repo-clone;1.5em;sd-mr-1` Semantic Deduplication
-:link: deduplication/semdedup
-:link-type: doc
-Identify and remove semantically similar documents using embeddings and clustering
-+++
-{bdg-secondary}`embeddings`
-{bdg-secondary}`meaning-based`
-{bdg-secondary}`gpu-accelerated`
-:::
-
-::::
-
-## Quality Assessment & Filtering
-
-Score and remove low-quality content using heuristics and ML classifiers.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`filter;1.5em;sd-mr-1` Heuristic Filtering
-:link: quality-assessment/heuristic
-:link-type: doc
-Filter text using configurable rules and metrics
-+++
-{bdg-secondary}`rules`
-{bdg-secondary}`metrics`
-{bdg-secondary}`fast`
-:::
-
-:::{grid-item-card} {octicon}`cpu;1.5em;sd-mr-1` Classifier Filtering
-:link: quality-assessment/classifier
-:link-type: doc
-Filter text using trained quality classifiers
-+++
-{bdg-secondary}`ml-models`
-{bdg-secondary}`quality`
-{bdg-secondary}`scoring`
-:::
-
-:::{grid-item-card} {octicon}`cpu;1.5em;sd-mr-1` Distributed Classification
-:link: quality-assessment/distributed-classifier
-:link-type: doc
-GPU-accelerated classification with pre-trained models
-+++
-{bdg-secondary}`gpu`
-{bdg-secondary}`distributed`
-{bdg-secondary}`scalable`
-:::
-
-::::
-
-## Specialized Processing
-
-Domain-specific processing for code and advanced curation tasks.
-
-::::{grid} 1 1 1 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` Code Processing
-:link: specialized-processing/code
-:link-type: doc
-Specialized filters for programming content and source code
-+++
-{bdg-secondary}`programming`
-{bdg-secondary}`syntax`
-{bdg-secondary}`comments`
-:::
-
-::::
-
-```{toctree}
-:maxdepth: 4
-:titlesonly:
-:hidden:
-
-Language Management <language-management/index>
-Content Processing & Cleaning <content-processing/index>
-Deduplication <deduplication/index>
-Quality Assessment & Filtering <quality-assessment/index>
-Specialized Processing <specialized-processing/index>
-```
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/language-management/language.md
-```md
----
-description: "Identify document languages accurately using FastText models supporting 176 languages for multilingual text processing"
-categories: ["how-to-guides"]
-tags: ["language-identification", "fasttext", "multilingual", "176-languages", "detection", "classification"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "how-to"
-modality: "text-only"
----
-
-# Language Identification
-
-(text-process-data-languages-id)=
-
-Large unlabeled text corpora often contain a variety of languages. NVIDIA NeMo Curator provides tools to accurately identify the language of each document, which is essential for language-specific curation tasks and building high-quality monolingual datasets.
-
-## How it Works
-
-NeMo Curator's language identification system works through a three-step process:
-
-1. **Text Preprocessing**: Before FastText classification, the input text is normalized by stripping whitespace and converting newlines to spaces.
-
-2. **FastText Language Detection**: The pre-trained FastText language identification model ([`lid.176.bin`](https://fasttext.cc/docs/en/language-identification.html)) analyzes the preprocessed text and returns:
-   - A confidence score (0.0 to 1.0) indicating certainty of the prediction
-   - A two-letter uppercase language code (for example, "EN", "ES", "FR") derived from the FastText label
-
-3. **Filtering and Scoring**: The pipeline filters documents based on a configurable confidence threshold (`min_langid_score`) and stores both the confidence score and language code as metadata.
-
-### Language Detection Process
-
-The `FastTextLangId` filter implements this workflow by:
-
-- Loading the FastText language identification model on worker initialization
-- Processing text through `model.predict()` with `k=1` to get the top language prediction
-- Extracting the language code from FastText labels (for example, `__label__en` becomes "EN")
-- Comparing confidence scores against the threshold to determine document retention
-- Returning results as `[confidence_score, language_code]` for downstream processing
-
-This approach supports **176 languages** with high accuracy, making it suitable for large-scale multilingual dataset curation where language-specific processing and monolingual dataset creation are critical.
-
-## Usage
-
-The following example demonstrates how to create a language identification pipeline using Curator with distributed processing.
-
-::::{tab-set}
-
-:::{tab-item} Python
-
-```python
-"""Language identification using Curator."""
-
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.filters.fasttext import FastTextLangId
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.filters import ScoreFilter
-
-def create_language_identification_pipeline(data_dir: str) -> Pipeline:
-    """Create a pipeline for language identification."""
-
-    # Define pipeline
-    pipeline = Pipeline(
-        name="language_identification",
-        description="Identify document languages using FastText"
-    )
-
-    # Add stages
-    # 1. Reader stage - creates tasks from JSONL files
-    pipeline.add_stage(
-        JsonlReader(
-            file_paths=data_dir,
-            files_per_partition=2,  # Each task processes 2 files
-        )
-    )
-
-    # 2. Language identification with filtering
-    # IMPORTANT: Download lid.176.bin or lid.176.ftz from https://fasttext.cc/docs/en/language-identification.html
-    fasttext_model_path = "/path/to/lid.176.bin"  # or lid.176.ftz (compressed)
-    pipeline.add_stage(
-        ScoreFilter(
-            FastTextLangId(model_path=fasttext_model_path, min_langid_score=0.3),
-            score_field="language"
-        )
-    )
-
-    return pipeline
-
-def main():
-    # Create pipeline
-    pipeline = create_language_identification_pipeline("./data")
-
-    # Print pipeline description
-    print(pipeline.describe())
-
-    # Run the pipeline (no executor is passed, so the default executor is used)
-    results = pipeline.run()
-
-    # Process results
-
-    total_documents = sum(task.num_items for task in results) if results else 0
-    print(f"Total documents processed: {total_documents}")
-
-    # Access language scores
-    for i, batch in enumerate(results):
-        if batch.num_items > 0:
-            df = batch.to_pandas()
-            print(f"Batch {i} columns: {list(df.columns)}")
-            # Language scores are now in the 'language' field
-
-if __name__ == "__main__":
-    main()
-```
-
-:::
-::::
-
-## Understanding Results
-
-The language identification process adds a score field to each document batch:
-
-1. **`language` field**: Contains the FastText language identification results as a string representation of a list with two elements (for backend compatibility):
-   - Element 0: The confidence score (between 0 and 1)
-   - Element 1: The language code in FastText format (for example, "EN" for English, "ES" for Spanish)
-
-2. **Task-based processing**: Curator processes documents in batches (tasks), and results are available through the task's Pandas DataFrame:
-
-```python
-# Access results from pipeline execution
-for batch in results:
-    df = batch.to_pandas()
-    # Language scores are in the 'language' column
-    print(df[['text', 'language']].head())
-```
-
-:::{tip}
-For quick exploratory inspection, converting a `DocumentBatch` to a Pandas DataFrame is fine. For performance and scalability, write transformations as `ProcessingStage`s (or with the `@processing_stage` decorator) and run them inside a `Pipeline` with an executor. Curator's parallelism and resource scheduling apply only when code runs as pipeline stages; ad hoc Pandas code executes on the driver and will not scale.
-:::
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/quality-assessment/classifier.md
-```md
----
-description: "Filter text using trained quality classifiers including FastText models and pre-trained language classification"
-categories: ["how-to-guides"]
-tags: ["classifier-filtering", "fasttext", "ml-models", "quality", "training", "scoring"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "how-to"
-modality: "text-only"
----
-
-(text-process-data-filter-classifier)=
-
-# Classifier-Based Filtering
-
-Classifier-based filtering uses machine learning models to differentiate between high-quality and low-quality documents. NVIDIA NeMo Curator implements an approach similar to the one described in [Brown et al., 2020](https://arxiv.org/abs/2005.14165), which trains a binary skip-gram classifier to distinguish between curated high-quality data and lower-quality data.
-
-## How It Works
-
-Classifier-based filtering learns the characteristics of high-quality documents from training data, unlike heuristic filtering which relies on predefined rules and thresholds. This approach is particularly effective when:
-
-- You have a reference dataset of known high-quality documents
-- The distinction between high and low quality is complex or subtle
-- You want to filter based on domain-specific characteristics
-
-NVIDIA NeMo Curator uses [fastText](https://fasttext.cc/) for implementing classifier-based filtering, which offers excellent performance and scalability for text classification tasks.
-
-:::{note}
-fastText is the official name and capitalization used by the fastText library created by Facebook Research.
-:::
-
-The classifier-based filtering process involves:
-
-1. Preparing training data by sampling from high-quality and low-quality datasets
-2. Training a binary skip-gram classifier using fastText
-3. Using the trained model to score documents in your dataset
-4. Filtering documents based on the classifier scores, optionally using Pareto-based sampling
-
----
-
-## Usage
-
-
-NeMo Curator provides two approaches for quality assessment:
-
-1. **Classification**: Use `QualityClassifier` to add quality predictions and optionally filter during classification
-2. **Filtering**: Use `FastTextQualityFilter` with `ScoreFilter` for document-level filtering with Pareto sampling
-
-:::{note}
-If you need to train custom fastText models for specific domains or requirements, refer to the [fastText documentation](https://fasttext.cc/docs/en/supervised-tutorial.html) for comprehensive training guides.
-:::
-
-::::{tab-set}
-
-:::{tab-item} DeBERTa Quality Classification
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import QualityClassifier
-
-# Create pipeline with DeBERTa quality classifier
-pipeline = Pipeline(name="deberta_quality_pipeline")
-
-# Add stages
-read_stage = JsonlReader("input_data/")
-classify_stage = QualityClassifier(
-    filter_by=["High"],  # Keep only high-quality documents
-    model_inference_batch_size=256,
-    max_chars=6000  # Default value
-)
-write_stage = JsonlWriter("high_quality_output/")
-
-pipeline.add_stage(read_stage)
-pipeline.add_stage(classify_stage)
-pipeline.add_stage(write_stage)
-
-# Execute pipeline
-results = pipeline.run()
-```
-
-:::
-
-:::{tab-item} FastText Quality Filter
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.filters import ScoreFilter
-from nemo_curator.stages.text.filters.fasttext import FastTextQualityFilter
-
-# Create pipeline with FastText filter (requires pre-trained model)
-pipeline = Pipeline(name="fasttext_quality_pipeline")
-
-# Add stages
-read_stage = JsonlReader("input_data/")
-filter_stage = ScoreFilter(
-    FastTextQualityFilter(
-        model_path="./quality_classifier.bin",  # Path to your fastText model
-        label="__label__hq",  # High quality label
-        alpha=3,              # Pareto distribution alpha parameter
-        seed=42               # Random seed for reproducibility
-    ),
-    text_field="text",
-    score_field="quality_score"
-)
-write_stage = JsonlWriter("high_quality_output/")
-
-pipeline.add_stage(read_stage)
-pipeline.add_stage(filter_stage)
-pipeline.add_stage(write_stage)
-
-# Execute pipeline
-results = pipeline.run()
-```
-
-:::
-
-:::{tab-item} Configuration
-
-You can configure quality classifiers and filters with different parameters:
-
-```python
-from nemo_curator.stages.text.classifiers import QualityClassifier
-from nemo_curator.stages.text.filters.fasttext import FastTextQualityFilter
-
-# DeBERTa quality classifier configurations
-basic_deberta_classifier = QualityClassifier(
-    filter_by=["High"],          # Keep only high-quality documents
-    model_inference_batch_size=256,
-    max_chars=6000               # Default value
-)
-
-# More inclusive DeBERTa classifier
-inclusive_deberta_classifier = QualityClassifier(
-    filter_by=["Medium", "High"], # Keep medium and high-quality documents
-    model_inference_batch_size=128,
-    max_chars=6000
-)
-
-# FastText quality filter configurations
-basic_fasttext_filter = FastTextQualityFilter(
-    model_path="./quality_classifier.bin",
-    label="__label__hq",         # High quality label
-    alpha=3,                     # Pareto distribution alpha parameter
-    seed=42                      # Random seed for reproducibility
-)
-
-# More selective FastText filter
-selective_fasttext_filter = FastTextQualityFilter(
-    model_path="./quality_classifier.bin",
-    label="__label__hq",
-    alpha=5,                     # Higher alpha for stricter filtering
-    seed=42
-)
-```
-
-:::
-
-::::
-
-## Quality Classifier and Filter Parameters
-
-### QualityClassifier (DeBERTa)
-
-The `QualityClassifier` accepts the following parameters:
-
-- `filter_by` (list, default=None): Quality levels to keep (options: "Low", "Medium", "High")
-- `model_inference_batch_size` (int, default=256): Batch size for inference
-- `max_chars` (int, default=6000): Max characters per document for processing
-- `label_field` (str, default="quality_pred"): Name of the prediction column
-- `text_field` (str, default="text"): Name of the text field in input data
-
-### FastTextQualityFilter
-
-The `FastTextQualityFilter` accepts the following parameters:
-
-- `model_path` (str, required): Path to the trained fastText model file
-- `label` (str, default="__label__hq"): The label for high-quality documents
-- `alpha` (float, default=3): Alpha parameter for Pareto distribution sampling
-- `seed` (int, default=42): Random seed for reproducible sampling
-
-## Best Practices
-
-For effective classifier-based filtering:
-
-1. **Model selection**: Start with the DeBERTa quality classifier for general use cases; consider fastText for high-throughput scenarios
-2. **Validation**: Manually review a sample of filtered results to confirm effectiveness
-3. **Quality level tuning**: Adjust `filter_by` levels (DeBERTa) or `alpha` values (fastText) based on your quality requirements
-4. **Batch size optimization**: Tune `model_inference_batch_size` for DeBERTa models based on your available memory
-5. **Combination with heuristics**: Consider using heuristic filters as a pre-filter to improve efficiency
-6. **Domain adaptation**: For specialized corpora, consider training custom models using domain-specific data
-
-```
-
-File: /Users/mromeijn/src/Curator/docs/curate-text/process-data/quality-assessment/distributed-classifier.md
-```md
----
-description: "Perform distributed data classification using GPU-accelerated models for domain, quality, safety, and content assessment"
-categories: ["how-to-guides"]
-tags: ["distributed-classification", "gpu", "domain", "quality", "safety", "crossfit", "scalable"]
-personas: ["data-scientist-focused", "mle-focused"]
-difficulty: "intermediate"
-content_type: "how-to"
-modality: "text-only"
----
-
-(text-process-data-filter-dist-classifier)=
-
-# Distributed Data Classification
-
-NVIDIA NeMo Curator provides a module for performing distributed classification on large text datasets using GPU acceleration. This enables the categorization and filtering of text documents based on multiple dimensions such as domain, quality, safety, educational value, content type, and more. These classifications can enhance the quality of training data for large language models by identifying high-value content and removing problematic material.
-
-## How It Works
-
-The distributed data classification in NeMo Curator works by:
-
-1. **Parallel Processing**: Chunking datasets across multiple computing nodes and GPUs to accelerate classification
-2. **Pre-trained Models**: Using specialized models for different classification tasks
-3. **Batched Inference**: Optimizing throughput with intelligent batching
-4. **Consistent API**: Providing a unified interface through the `DistributedDataClassifier` base class
-
-The `DistributedDataClassifier` is designed to run on GPU clusters with minimal code changes regardless of which specific classifier you're using. All classifiers support filtering based on classification results and storing prediction scores as metadata.
-
-:::{note}
-Distributed classification requires GPU acceleration and is not supported for CPU-only processing. As long as GPU resources are available and NeMo Curator is correctly installed, GPU acceleration is handled automatically.
-:::
-
-```{tip}
-**Running the tutorial notebooks**: The classification tutorial notebooks require the `text_cuda12` or `all` installation extra to include all relevant dependencies. If you encounter `ModuleNotFoundError`, reinstall with the appropriate extra:
-
-    uv pip install "nemo-curator[text_cuda12]"
-
-When using classifiers that download from Hugging Face (such as Aegis and InstructionDataGuard), set your `HF_TOKEN` environment variable to avoid rate limiting:
-
-    export HF_TOKEN="your_token_here"
-```
-
----
-
-## Usage
-
-NVIDIA NeMo Curator provides a base class `DistributedDataClassifier` that can be extended to fit your specific model. The only requirement is that the model can fit on a single GPU. This module operates on the GPU and works within the pipeline framework using DocumentBatch processing.
-
-### Classifier Comparison
-
-| Classifier | Purpose | Model Location | Key Parameters | Requirements |
-|---|---|---|---|---|
-| DomainClassifier | Assigns one of 26 domain labels (such as "Sports," "Science," "News") to English text | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) | `filter_by`, `text_field` | None |
-| MultilingualDomainClassifier | Assigns domain labels to text in 52 languages; same labels as DomainClassifier | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) | `filter_by`, `text_field` | None |
-| QualityClassifier | Rates document quality as "Low," "Medium," or "High" using a DeBERTa model | [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) | `filter_by`, `text_field` | None |
-| AegisClassifier | Detects unsafe content across 13 risk categories (violence, hate speech, and others) using LlamaGuard | [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) | `aegis_variant`, `filter_by` | HuggingFace token |
-| InstructionDataGuardClassifier | Identifies LLM poisoning attacks in instruction-response pairs | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) | `text_field`, `label_field` | HuggingFace token |
-| FineWebEduClassifier | Scores educational value from 0 to 5 (0=spam, 5=scholarly) for training data selection | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) | `label_field`, `int_score_field` | None |
-| FineWebMixtralEduClassifier | Scores educational value from 0 to 5 using Mixtral 8x22B annotation data | [nvidia/nemocurator-fineweb-mixtral-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) | `label_field`, `int_score_field`, `model_inference_batch_size=1024` | None |
-| FineWebNemotronEduClassifier | Scores educational value from 0 to 5 using Nemotron-4-340B annotation data | [nvidia/nemocurator-fineweb-nemotron-4-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) | `label_field`, `int_score_field`, `model_inference_batch_size=1024` | None |
-| ContentTypeClassifier | Categorizes text into 11 speech types (such as "Blogs," "News," "Academic") | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) | `filter_by`, `text_field` | None |
-| PromptTaskComplexityClassifier | Labels prompts by task type (such as QA and summarization) and complexity dimensions | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) | `text_field` | None |
-
-### Domain Classifier
-
-The Domain Classifier categorizes English text documents into specific domains or subject areas.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import DomainClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="domain_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="books_dataset/",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the classifier, filtering for specific domains
-domain_classifier = DomainClassifier(filter_by=["Games", "Sports"])
-pipeline.add_stage(domain_classifier)
-
-# Save the results
-writer = JsonlWriter(path="games_and_sports/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-### Multilingual Domain Classifier
-
-Functionally similar to the Domain Classifier, but supports 52 languages.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import MultilingualDomainClassifier
-
-pipeline = Pipeline(name="multilingual_domain_classification")
-pipeline.add_stage(JsonlReader(file_paths="multilingual_dataset/", fields=["text", "id"]))
-pipeline.add_stage(MultilingualDomainClassifier(filter_by=["Games", "Sports"]))
-pipeline.add_stage(JsonlWriter(path="classified_output/"))
-
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-### Quality Classifier
-
-The Quality Classifier assesses document quality using the NVIDIA Quality Classifier DeBERTa model.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import QualityClassifier
-
-pipeline = Pipeline(name="quality_classification")
-pipeline.add_stage(JsonlReader(file_paths="web_documents/", fields=["text", "id"]))
-pipeline.add_stage(QualityClassifier())
-pipeline.add_stage(JsonlWriter(path="quality_classified/"))
-
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-:::{note}
-The exact label categories returned by the Quality Classifier depend on the model configuration. Check the prediction column in your results to see the available labels for filtering with the `filter_by` parameter.
-:::
-
-### AEGIS Safety Classifier
-
-The AEGIS classifier detects unsafe content across 13 critical risk categories. It requires a HuggingFace token for access to Llama Guard.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import AegisClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="aegis_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="content/",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the AEGIS classifier
-token = "hf_1234"  # Your HuggingFace user access token
-safety_classifier = AegisClassifier(
-    aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0",
-    hf_token=token,
-    filter_by=["safe", "O13"]  # Keep only safe content and "needs caution" category
-)
-pipeline.add_stage(safety_classifier)
-
-# Save the results
-writer = JsonlWriter(path="safe_content/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-The classifier adds a column with labels: "safe," "O1" through "O13" (each representing specific safety risks), or "unknown." For raw LLM output, use:
-
-```python
-safety_classifier = AegisClassifier(
-    aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0",
-    hf_token=token,
-    keep_raw_output=True,
-    raw_output_field="raw_predictions"
-)
-```
-
-### Instruction Data Guard
-
-Detects LLM poisoning attacks in instruction-response datasets. Requires HuggingFace token access.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import InstructionDataGuardClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="instruction_data_guard")
-
-# Load dataset
-# For instruction-response data: "Instruction: {instruction}. Input: {input_}. Response: {response}."
-reader = JsonlReader(
-    file_paths="instruction_data/",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the classifier
-token = "hf_1234"  # Your HuggingFace user access token
-classifier = InstructionDataGuardClassifier(hf_token=token)
-pipeline.add_stage(classifier)
-
-# Save the results
-writer = JsonlWriter(path="guard_classified/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-The output includes two columns: a float score `instruction_data_guard_poisoning_score` and a Boolean `is_poisoned`.
-
-### FineWeb Educational Content Classifier
-
-Scores documents for educational value on a scale from 0 to 5. This helps prioritize content for knowledge-intensive tasks.
-
-#### Score Ranges and Meanings
-
-| Score | Label | Description | Example Content |
-|-------|-------|-------------|-----------------|
-| 0-1 | Very Low | No educational value | Spam, advertisements, broken content |
-| 2 | Low | Minimal educational content | Simple lists, basic product descriptions |
-| 3 | Moderate | Some educational value | News articles, basic how-to guides |
-| 4 | High | Good educational content | Detailed tutorials, academic discussions |
-| 5 | Very High | Excellent educational material | Comprehensive guides, scholarly articles |
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import FineWebEduClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="fineweb_edu_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="web_documents/*.jsonl",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the FineWeb Edu classifier
-edu_classifier = FineWebEduClassifier(
-    model_inference_batch_size=256,
-    float_score_field="fineweb-edu-score-float",  # Raw float scores
-    int_score_field="fineweb-edu-score-int",      # Rounded integer scores
-    label_field="fineweb-edu-score-label"         # Quality labels
-)
-pipeline.add_stage(edu_classifier)
-
-# Save the results
-writer = JsonlWriter(path="edu_classified/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-### FineWeb Mixtral and Nemotron Edu Classifiers
-
-Similar to the FineWeb Edu Classifier but trained with different annotation sources:
-
-- **FineWebMixtralEduClassifier**: Uses annotations from Mixtral 8x22B-Instruct
-- **FineWebNemotronEduClassifier**: Uses annotations from Nemotron-4-340B-Instruct
-
-Both provide a quality label column marking scores above 2.5 as "high_quality":
-
-#### Quality Label Mapping
-
-| Score Range | Quality Label | Description |
-|-------------|---------------|-------------|
-| 0.0 - 2.5 | `low_quality` | Below average educational value |
-| 2.5 - 5.0 | `high_quality` | Above average educational value |
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import FineWebMixtralEduClassifier  # or FineWebNemotronEduClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="fineweb_mixtral_edu_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="web_documents/*.jsonl",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the FineWeb Mixtral Edu classifier
-classifier = FineWebMixtralEduClassifier(
-    float_score_field="fineweb-mixtral-edu-score-float",  # Raw float scores
-    int_score_field="fineweb-mixtral-edu-score-int",      # Rounded integer scores
-    label_field="fineweb-mixtral-edu-score-label"          # "high_quality" or "low_quality"
-)
-pipeline.add_stage(classifier)
-
-# Save the results
-writer = JsonlWriter(path="mixtral_edu_classified/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-### Content Type Classifier
-
-Categorizes documents into 11 distinct speech types.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import ContentTypeClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="content_type_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="content/",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the Content Type classifier
-classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])
-pipeline.add_stage(classifier)
-
-# Save the results
-writer = JsonlWriter(path="content_type_classified/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-### Prompt Task and Complexity Classifier
-
-Classifies prompts by task type and complexity dimensions.
-
-```python
-from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.io.reader import JsonlReader
-from nemo_curator.stages.text.io.writer import JsonlWriter
-from nemo_curator.stages.text.classifiers import PromptTaskComplexityClassifier
-
-# Create pipeline
-pipeline = Pipeline(name="prompt_task_complexity_classification")
-
-# Load dataset
-reader = JsonlReader(
-    file_paths="prompts/",
-    fields=["text", "id"]
-)
-pipeline.add_stage(reader)
-
-# Apply the Prompt Task Complexity classifier
-classifier = PromptTaskComplexityClassifier()
-pipeline.add_stage(classifier)
-
-# Save the results
-writer = JsonlWriter(path="prompt_complexity_classified/")
-pipeline.add_stage(writer)
-
-# Execute pipeline
-results = pipeline.run()  # Uses XennaExecutor by default
-```
-
-## Custom Model Integration
-
-You can integrate your own classification models by extending `DistributedDataClassifier`. Refer to the [Text Classifiers README](https://github.com/NVIDIA-NeMo/Curator/tree/main/nemo_curator/stages/text/classifiers#text-classifiers) for implementation details and examples.
-
-## Performance Optimization
-
-NVIDIA NeMo Curator's distributed classifiers are optimized for high-throughput processing through several key features:
-
-### CPU-based tokenization and GPU-based model inference
-
-Each classifier is broken down under the hood into a tokenizer stage and a model inference stage. Tokenization is run on the CPU while model inference is run on the GPU. For example, this means that behind the scenes, the `DomainClassifier` stage is actually being broken down into 2 stages (some parameters and details omitted to avoid complexity):
-
-```python
-class TokenizerStage:
-    self.resources = Resources(cpus=1)
-    self.model_identifier = "nvidia/domain-classifier"
-    self.text_field = "text"
-    self.padding_side = "right"
-    ...
-class ModelStage:
-    self.resources = Resources(cpus=1, gpus=1)
-    self.model_identifier = "nvidia/domain-classifier"
-    self.model_inference_batch_size = 256
-    ...
-```
-
-Pipelines take care of resource allocation and autoscaling to achieve enhanced performance and minimize GPU idleness. This means that we are able to achieve speedups by ensuring that model inference is run in parallel across all available GPUs, while other stages such as I/O, tokenization, and filtering are run across all available CPUs. This is possible because Curator pipelines are composable, which allows each stage in a pipeline to run independently and with its own specified hardware resources.
-
-### Intelligent Batching and Sequence Handling
-
-The classifiers optimize throughput through:
-
-- **Length-based sorting**: Input sequences are sorted by length when `sort_by_length=True` (default)
-- **Efficient batching**: Similar-length sequences are grouped together to minimize padding overhead
-- **GPU memory optimization**: Batches are sized to maximize GPU utilization based on available memory
-
-### Avoid Unnecessary Re-Tokenization
-
-Several of the text classifiers use the same tokenizer before running the model forward pass. To avoid unnecessary re-tokenization, the `keep_tokens` and `use_existing_tokens` parameters can be used.
-
-**Important: Not every text classifier uses the same tokenizer, so it is important to confirm that classifiers' tokenizers are compatible with each other. Curator will not verify this for you.**
-
-The `ContentTypeClassifier`, `QualityClassifier`, `DomainClassifier`, and `PromptTaskComplexityClassifier` all use a DeBERTa tokenizer, which means that we only need to tokenize once. To avoid unnecessary re-tokenization, you can do:
-
-```python
-# Since this is the first classifier in the pipeline, there are no existing tokens to use,
-# but we can make sure to keep the computed tokens for the next classifier
-content_type_classifier = ContentTypeClassifier(use_existing_tokens=False, keep_tokens=True, ...)
-pipeline.add_stage(content_type_classifier)
-
-# Use tokens from the previous classifier and keep tokens for the next classifier
-quality_classifier = QualityClassifier(use_existing_tokens=True, keep_tokens=True, ...)
-pipeline.add_stage(quality_classifier)
-
-# Use tokens from the previous classifier and keep tokens for the next classifier
-domain_classifier = DomainClassifier(use_existing_tokens=True, keep_tokens=True, ...)
-pipeline.add_stage(domain_classifier)
-
-# Use tokens from the previous classifier
-# Since this is the final classifier in the pipeline, we drop the computed tokens
-prompt_task_complexity_classifier = PromptTaskComplexityClassifier(use_existing_tokens=True, keep_tokens=False, ...)
-pipeline.add_stage(prompt_task_complexity_classifier)
-```
-
-In addition to the above example, the `FineWebEduClassifier`, `FineWebMixtralEduClassifier`, and `FineWebNemotronEduClassifier` are all compatible with each other:
-
-```python
-fineweb_classifier = FineWebEduClassifier(use_existing_tokens=False, keep_tokens=True, ...)
-pipeline.add_stage(fineweb_classifier)
-
-fineweb_mixtral_classifier = FineWebMixtralEduClassifier(use_existing_tokens=True, keep_tokens=True, ...)
-pipeline.add_stage(fineweb_mixtral_classifier)
-
-fineweb_nemotron_classifier = FineWebNemotronEduClassifier(use_existing_tokens=True, keep_tokens=False, ...)
-pipeline.add_stage(fineweb_nemotron_classifier)
-```
-
-The `AegisClassifier` variants ([nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0)) are compatible with each other as well. This example is a bit more complex because it also involves keeping the formatted Aegis prompt field. See the `AegisClassifier` implementation for more details.
-
-```python
-aegis_defensive_classifier = AegisClassifier(
-    aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0",
-    label_field="aegis_defensive_pred",
-    use_existing_tokens=False,
-    keep_tokens=True,
-    keep_aegis_prompt_field=True,
-    ...
-)
-pipeline.add_stage(aegis_defensive_classifier)
-
-aegis_permissive_classifier = AegisClassifier(
-    aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0",
-    label_field="aegis_permissive_pred",
-    use_existing_tokens=True,
-    aegis_prompt_field="_curator_hidden_text",  # created by aegis_defensive_classifier
-    keep_tokens=False,
-    keep_aegis_prompt_field=False,
-    ...
-)
-pipeline.add_stage(aegis_permissive_classifier)
-```
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/filters/doc_filter.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC, abstractmethod
-
-
-class DocumentFilter(ABC):
-    """
-    An abstract base class for text-based document filters.
-
-    This class serves as a template for creating specific document filters
-    in the library. Subclasses should implement the abstract methods to
-    define custom filtering behavior.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._name = self.__class__.__name__
-        self._sentences = None
-        self._paragraphs = None
-        self._ngrams = None
-
-    @abstractmethod
-    def score_document(self, text: str) -> float | list[int | float]:
-        """
-        Calculate a score for the given document text.
-
-        This method should be implemented by subclasses to define how
-        a document's text is evaluated and scored.
-
-        Args:
-            text (str): The text content of the document to be scored.
-
-        Returns:
-            Any: A score or set of scores representing the document's
-            relevance or quality. The type and structure of the
-            return value should be consistent for each subclass.
-
-        Raises:
-            NotImplementedError: If the method is not implemented in a subclass.
-        """
-        msg = "score_document method must be implemented by subclasses"
-        raise NotImplementedError(msg)
-
-    @abstractmethod
-    def keep_document(self, scores: float | list[int | float]) -> bool:
-        """
-        Determine whether to keep a document based on its scores.
-
-        This method should be implemented by subclasses to define the
-        criteria for keeping or discarding a document based on the
-        scores calculated by score_document().
-
-        Args:
-            scores (float | list[int | float]): The score or set of scores returned by score_document().
-                          The type should match what is returned by score_document().
-
-        Returns:
-            bool: True if the document should be kept, False otherwise.
-
-        Raises:
-            NotImplementedError: If the method is not implemented in a subclass.
-        """
-        msg = "keep_document method must be implemented by subclasses"
-        raise NotImplementedError(msg)
-
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @property
-    def sentences(self) -> list:
-        return self._sentences
-
-    @sentences.setter
-    def sentences(self, sentences: list) -> None:
-        self._sentences = sentences
-
-    @property
-    def paragraphs(self) -> list:
-        return self._paragraphs
-
-    @paragraphs.setter
-    def paragraphs(self, paragraphs: list) -> None:
-        self._paragraphs = paragraphs
-
-    @property
-    def ngrams(self) -> dict:
-        return self._ngrams
-
-    @ngrams.setter
-    def ngrams(self, ngrams: dict) -> None:
-        self._ngrams = ngrams
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/filters/score_filter.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections.abc import Callable
-from dataclasses import dataclass
-from typing import Any, Literal
-
-import pandas as pd
-from loguru import logger
-
-from nemo_curator.backends.base import NodeInfo, WorkerMetadata
-from nemo_curator.stages.base import ProcessingStage
-from nemo_curator.stages.text.filters.doc_filter import DocumentFilter
-from nemo_curator.tasks import DocumentBatch
-
-
-@dataclass
-class Score(ProcessingStage[DocumentBatch, DocumentBatch]):
-    """
-    The module responsible for adding metadata to records based on statistics about the text.
-    It accepts an arbitrary scoring function that accepts a text field and returns a score.
-    It also accepts a DocumentFilter object, in which case the score_fn will be the score_document method of the DocumentFilter.
-
-    Unlike ScoreFilter, it does not filter based on the computed score.
-    It only adds metadata to the record.
-
-    If a list of DocumentFilters is provided, the filters are applied in order.
-    In this case, the score_field parameter should be a list of strings corresponding to the filters.
-    If different filters should be applied to different text fields, then text_field should be a list of strings corresponding to the filters.
-
-    Args:
-        score_fn (Callable | DocumentFilter | list[DocumentFilter]): The score function or the DocumentFilter object (or list of DocumentFilters). If it is a DocumentFilter object, the score_fn will be the score_document method of the DocumentFilter.
-        score_field (str | list[str]): The field (or list of fields) the score will be stored in.
-        text_field (str | list[str]): The field (or list of fields) the documents will be read from.
-
-    """
-
-    score_fn: Callable[[str], float | str] | DocumentFilter | list[DocumentFilter]
-    score_field: str | list[str]
-    text_field: str | list[str] = "text"
-    name: str = "score_fn"
-
-    def __post_init__(self):
-        self.name, self.score_fn, self.text_field, _, self.score_field = _validate_and_normalize_filters(
-            self.score_fn, self.text_field, None, self.score_field, "score"
-        )
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.text_field
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.text_field + self.score_field
-
-    def ray_stage_spec(self) -> dict[str, Any]:
-        requires_setup = any(
-            hasattr(score_fn, "load_model") or hasattr(score_fn, "load_tokenizer")
-            for score_fn in self.score_fn
-            if isinstance(score_fn, DocumentFilter)
-        )
-        return {"is_actor_stage": requires_setup}
-
-    def setup_on_node(
-        self,
-        _node_info: NodeInfo | None = None,
-        _worker_metadata: WorkerMetadata | None = None,
-    ) -> None:
-        for score_fn in self.score_fn:
-            if isinstance(score_fn, DocumentFilter) and hasattr(score_fn, "model_check_or_download"):
-                score_fn.model_check_or_download()
-
-    def setup(self, _: WorkerMetadata | None = None) -> None:
-        for score_fn in self.score_fn:
-            if isinstance(score_fn, DocumentFilter):
-                if hasattr(score_fn, "load_model"):
-                    score_fn.load_model()
-                if hasattr(score_fn, "load_tokenizer"):
-                    score_fn.load_tokenizer()
-
-    def process(self, batch: DocumentBatch) -> DocumentBatch | None:
-        """
-        Applies the scoring to a dataset
-
-        Args:
-            batch (DocumentBatch): The batch to apply the module to
-
-        Returns:
-            DocumentBatch: A batch with the new score
-
-        """
-        df = batch.to_pandas()
-
-        if df.empty:
-            logger.info(f"Empty dataset for batch {batch.task_id}")
-            return batch
-
-        for score_fn_i, text_field_i, score_field_i in zip(
-            self.score_fn, self.text_field, self.score_field, strict=True
-        ):
-            inner_score_fn = score_fn_i.score_document if isinstance(score_fn_i, DocumentFilter) else score_fn_i
-            df[score_field_i] = df[text_field_i].apply(inner_score_fn)
-
-        # Create output batch
-        return DocumentBatch(
-            task_id=f"{batch.task_id}_{self.name}",
-            dataset_name=batch.dataset_name,
-            data=df,
-            _metadata=batch._metadata,
-            _stage_perf=batch._stage_perf,
-        )
-
-
-@dataclass
-class Filter(ProcessingStage[DocumentBatch, DocumentBatch]):
-    """
-    The module responsible for filtering records based on a metadata field.
-    It accepts an arbitrary filter function that accepts a metadata field and returns True if the field should be kept.
-    It also accepts a DocumentFilter object, in which case the filter_fn will be the keep_document method of the DocumentFilter.
-    Unlike ScoreFilter, it does not compute the metadata based on a document.
-    It only filters using existing metadata.
-
-    If a list of DocumentFilters is provided, the filters are applied in order.
-    In this case, the filter_field parameter should be a list of strings corresponding to the filters.
-    If some filters should be inverted and others not, then invert should be a list of booleans corresponding to the filters.
-
-    Args:
-        filter_fn (Callable | DocumentFilter | list[DocumentFilter]): A function (or list of functions) that returns True if the document is to be kept or a DocumentFilter object,
-            in which case the filter_fn will be the keep_document method of the DocumentFilter.
-        filter_field (str | list[str]): The field (or list of fields) to be passed into the filter function.
-        invert (bool | list[bool]): Whether to invert the filter condition.
-
-    """
-
-    filter_fn: Callable | DocumentFilter | list[DocumentFilter]
-    filter_field: str | list[str]
-    invert: bool | list[bool] = False
-    name: str = "filter_fn"
-
-    def __post_init__(self):
-        self.name, self.filter_fn, self.filter_field, self.invert, _ = _validate_and_normalize_filters(
-            self.filter_fn, self.filter_field, self.invert, None, "filter"
-        )
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.filter_field
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.filter_field
-
-    def compute_filter_mask(
-        self, df: pd.DataFrame, filter_fn: Callable | DocumentFilter, filter_field: str, invert: bool
-    ) -> pd.Series:
-        """Compute the bool mask to filter the dataset.
-
-        Args:
-            df (pd.DataFrame): The dataset to compute filter mask on.
-            filter_fn (Callable | DocumentFilter): The filter function to use.
-            filter_field (str): The field to read the filter from.
-            invert (bool): Whether to invert the filter condition.
-
-        Returns:
-            Series: A mask corresponding to each data instance indicating whether it will be retained.
-
-        """
-
-        if isinstance(filter_fn, DocumentFilter):
-            filter_fn = filter_fn.keep_document
-
-        bool_mask = df[filter_field].apply(filter_fn)
-
-        if invert:
-            bool_mask = ~bool_mask
-
-        return bool_mask
-
-    def process(self, batch: DocumentBatch) -> DocumentBatch | None:
-        """
-        Applies the filtering to a dataset
-
-        Args:
-            batch (DocumentBatch): The batch to apply the module to
-
-        Returns:
-            DocumentBatch: A batch with entries removed according to the filter
-
-        """
-        df = batch.to_pandas()
-
-        if df.empty:
-            logger.info(f"Empty dataset for batch {batch.task_id}")
-            return batch
-
-        for filter_fn_i, filter_field_i, invert_i in zip(self.filter_fn, self.filter_field, self.invert, strict=True):
-            bool_mask = self.compute_filter_mask(df, filter_fn_i, filter_field_i, invert_i)
-            df = df[bool_mask]
-
-        if len(df) == 0:
-            logger.info(f"All documents filtered out for batch {batch.task_id}")
-
-        # Create output batch
-        return DocumentBatch(
-            task_id=f"{batch.task_id}_{self.name}",
-            dataset_name=batch.dataset_name,
-            data=df,
-            _metadata=batch._metadata,
-            _stage_perf=batch._stage_perf,
-        )
-
-
-@dataclass
-class ScoreFilter(ProcessingStage[DocumentBatch, DocumentBatch]):
-    """
-    The module responsible for applying a filter (or chain of filters) to all documents in a dataset.
-    It accepts an arbitrary DocumentFilter and first computes the score for a document.
-    Then, determines whether to keep the document based on the criteria in the DocumentFilter.
-
-    The filter can be applied to any field in the dataset, and the score can be logged for later.
-    Also, the filter can be inverted such that "rejected" documents are kept.
-
-    If a list of DocumentFilters is provided, the filters are applied in order.
-    If different filters should be applied to different text fields, then text_field should be a list of strings corresponding to the filters.
-    If different score fields should be created for each filter, then score_field should be a list of strings corresponding to the filters.
-    If some filters should be inverted and others not, then invert should be a list of booleans corresponding to the filters.
-
-    Args:
-        filter_obj (DocumentFilter | list[DocumentFilter]): The score function (or list of score functions) that takes in a document string and outputs a score for the document.
-        text_field (str | list[str]): The field (or list of fields) the documents will be read from.
-        score_field (str | list[str] | None): The field (or list of fields) to which the scores will be written. If None, scores will be immediately discarded after use.
-        invert (bool | list[bool]): If True, will keep all documents that are normally discarded.
-
-    """
-
-    filter_obj: DocumentFilter | list[DocumentFilter]
-    text_field: str | list[str] = "text"
-    score_field: str | list[str] | None = None
-    invert: bool | list[bool] = False
-    name: str = "score_filter"
-
-    def __post_init__(self):
-        self.name, self.filter_obj, self.text_field, self.invert, self.score_field = _validate_and_normalize_filters(
-            self.filter_obj, self.text_field, self.invert, self.score_field, "score_filter"
-        )
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.text_field
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], self.text_field + self.score_field if self.score_field is not None else []
-
-    def ray_stage_spec(self) -> dict[str, Any]:
-        requires_setup = any(
-            hasattr(filter_obj, "load_model") or hasattr(filter_obj, "load_tokenizer")
-            for filter_obj in self.filter_obj
-            if isinstance(filter_obj, DocumentFilter)
-        )
-        return {"is_actor_stage": requires_setup}
-
-    def setup_on_node(
-        self,
-        _node_info: NodeInfo | None = None,
-        _worker_metadata: WorkerMetadata | None = None,
-    ) -> None:
-        for filter_obj in self.filter_obj:
-            if isinstance(filter_obj, DocumentFilter) and hasattr(filter_obj, "model_check_or_download"):
-                filter_obj.model_check_or_download()
-
-    def setup(self, _: WorkerMetadata | None = None) -> None:
-        for filter_obj in self.filter_obj:
-            if isinstance(filter_obj, DocumentFilter):
-                if hasattr(filter_obj, "load_model"):
-                    filter_obj.load_model()
-                if hasattr(filter_obj, "load_tokenizer"):
-                    filter_obj.load_tokenizer()
-
-    def compute_filter_mask(
-        self, df: pd.DataFrame, filter_obj: DocumentFilter, text_field: str, score_field: str | None, invert: bool
-    ) -> pd.Series:
-        """Compute the bool mask to filter the dataset.
-
-        Args:
-            df (pd.DataFrame): The dataset to compute filter mask on.
-            filter_obj (DocumentFilter): The filter object to use.
-            text_field (str): The field to read the text from.
-            score_field (str | None): The field to write the scores to.
-            invert (bool): Whether to invert the filter condition.
-
-        Returns:
-            Series: A mask corresponding to each data instance indicating whether it will be retained.
-
-        """
-
-        scores = df[text_field].apply(filter_obj.score_document)
-
-        if score_field is not None:
-            df[score_field] = scores
-
-        bool_mask = scores.apply(filter_obj.keep_document)
-
-        if invert:
-            bool_mask = ~bool_mask
-
-        return bool_mask
-
-    def process(self, batch: DocumentBatch) -> DocumentBatch | None:
-        """
-        Scores and filters all records in the dataset
-
-        Args:
-            batch (DocumentBatch): The batch to apply the module to
-
-        Returns:
-            DocumentBatch: A batch with the score and filter applied
-
-        """
-        df = batch.to_pandas()
-
-        if df.empty:
-            logger.info(f"Empty dataset for batch {batch.task_id}")
-            return batch
-
-        for filter_obj_i, text_field_i, score_field_i, invert_i in zip(
-            self.filter_obj, self.text_field, self.score_field, self.invert, strict=True
-        ):
-            bool_mask = self.compute_filter_mask(df, filter_obj_i, text_field_i, score_field_i, invert_i)
-            df = df[bool_mask]
-
-        if len(df) == 0:
-            logger.info(f"All documents filtered out for batch {batch.task_id}")
-
-        # Create output batch
-        return DocumentBatch(
-            task_id=f"{batch.task_id}_{self.name}",
-            dataset_name=batch.dataset_name,
-            data=df,
-            _metadata=batch._metadata,
-            _stage_perf=batch._stage_perf,
-        )
-
-
-def _filter_name(x: DocumentFilter | Callable) -> str:
-    return x.name if isinstance(x, DocumentFilter) else x.__name__
-
-
-def _get_filter_stage_name(filters: list[DocumentFilter | Callable], prefix: str) -> str:
-    """
-    Derive the stage name from the provided score/filter functions.
-
-    """
-    return (
-        _filter_name(filters[0])
-        if len(filters) == 1
-        else f"{prefix}_chain_of_" + "_".join(_filter_name(f) for f in filters)
-    )
-
-
-def _format_single_field_list(
-    _field: str | list[str] | None, field_name: str, field_type: type = str
-) -> list[str] | list[bool]:
-    """
-    In the case of a single DocumentFilter or Callable, format the relevant field
-    (filter_field, score_field, text_field, invert) to a list of length 1.
-
-    Args:
-        _field (str | list[str] | None): The field to check and format.
-        field_name (str): The name of the field, which is used in error messages.
-        field_type (type): The type of the field, which is used in an isinstance check.
-
-    Returns:
-        list[str] | list[bool]: The reformatted field.
-
-    """
-    if isinstance(_field, list):
-        if len(_field) > 1:
-            msg = f"More {field_name} fields than functions provided: {_field}"
-            raise ValueError(msg)
-    elif isinstance(_field, field_type):
-        _field = [_field]
-    else:
-        msg = f"{field_name} field must be a {field_type} or list of {field_type}: {_field}"
-        raise TypeError(msg)
-
-    return _field
-
-
-def _format_field_list(
-    _field: str | list[str] | None, filter_count: int, field_name: str, field_type: type = str
-) -> list[str] | list[bool]:
-    """
-    In the case of a list of DocumentFilters or Callables, format the relevant field
-    (filter_field, score_field, text_field, invert) to a list of length equal to the number of filters.
-
-    Args:
-        _field (str | list[str] | None): The field to check and format.
-        filter_count (int): The number of filters. This will be the length of the output list.
-        field_name (str): The name of the field, which is used in error messages.
-        field_type (type): The type of the field, which is used in an isinstance check.
-
-    Returns:
-        list[str] | list[bool]: The reformatted field.
-
-    """
-    if isinstance(_field, list):
-        if len(_field) == 1:
-            logger.info(f"Using the same {field_name} field for all functions: {_field}")
-            _field = [_field] * filter_count
-        if len(_field) != filter_count:
-            msg = f"Number of {field_name} fields must match number of functions: {_field}"
-            raise ValueError(msg)
-    elif isinstance(_field, field_type):
-        logger.info(f"Using the same {field_name} field for all functions: {_field}")
-        _field = [_field] * filter_count
-    else:
-        msg = f"{field_name} field must be a {field_type} or list of {field_type}: {_field}"
-        raise TypeError(msg)
-
-    return _field
-
-
-def _validate_and_normalize_filters(  # noqa: C901, PLR0912
-    _filter: DocumentFilter | Callable | list[DocumentFilter | Callable],
-    input_field: str | list[str] | None,
-    invert: bool | list[bool] | None,
-    output_field: str | list[str] | None,
-    fn_type: Literal["score", "filter", "score_filter"],
-) -> tuple[str, list[DocumentFilter | Callable], list[str] | None, list[bool] | None, list[str] | None]:
-    """
-    Validate and normalize all parameters needed for the Score, Filter, and ScoreFilter modules.
-    "Normalize" means to reformat all parameters to a list of length equal to the number of filters.
-
-    Args:
-        _filter (DocumentFilter | Callable | list[DocumentFilter | Callable]): The filter object or list of filter objects.
-        input_field (str | list[str] | None): The input field. For Score and ScoreFilter, this is the text field. For Filter, this is the filter field.
-        invert (bool | list[bool] | None): The invert flag. This is used for Filter and ScoreFilter.
-        output_field (str | list[str] | None): The output field. For Score and ScoreFilter, this is the score field. For Filter, this is not used.
-        fn_type (Literal["score", "filter", "score_filter"]): The type of the module.
-
-    Returns:
-        tuple[str, list[DocumentFilter | Callable], list[str] | None, list[bool] | None, list[str] | None]:
-            The first string returned corresponds to the name given to the DocumentFilter or Callable.
-            The normalized filters, input fields, invert flags, and output fields make up the rest of the tuple.
-
-    """
-
-    # For Score and ScoreFilter, the input_field is the text field
-    # For Filter, the input_field is the filter field
-    input_field_name = "filter" if fn_type == "filter" else "text"
-    if input_field is None:
-        msg = f"{input_field_name}_field cannot be None"
-        raise ValueError(msg)
-
-    # Score is the only module that explicitly requires an output field,
-    # i.e., a score_field that is calculated by the DocumentFilter or Callable.
-    if output_field is None and fn_type == "score":
-        msg = "score_field cannot be None"
-        raise ValueError(msg)
-
-    if isinstance(_filter, DocumentFilter):
-        _name = _filter.name
-    elif isinstance(_filter, Callable):
-        _name = f"{fn_type}_fn"
-
-    if isinstance(_filter, (DocumentFilter, Callable)):
-        _normalized_filter = [_filter]
-        _input_field = _format_single_field_list(input_field, input_field_name, field_type=str)
-
-        if fn_type in ["filter", "score_filter"]:
-            _invert = _format_single_field_list(invert, "invert", field_type=bool)
-        else:
-            # Score does not use an invert flag
-            _invert = None
-
-        if fn_type in ["score", "score_filter"]:
-            # ScoreFilter is allowed to have no output fields, but Score is not
-            if output_field is None and fn_type == "score_filter":
-                _output_field = [None]
-            else:
-                _output_field = _format_single_field_list(output_field, "score", field_type=str)
-        else:
-            # Filter does not use an output field
-            _output_field = None
-
-    elif isinstance(_filter, list):
-        _name = _get_filter_stage_name(_filter, prefix=fn_type)
-        _normalized_filter = _filter
-
-        # Technically, you could run a list of filters on the same filter_field.
-        # However, prefer to use a list of fields to avoid confusion.
-        if fn_type == "filter" and (
-            isinstance(input_field, str) or (isinstance(input_field, list) and len(input_field) == 1)
-        ):
-            msg = f"filter_field must be a list of strings if multiple filters are used: {input_field}"
-            raise ValueError(msg)
-
-        _input_field = _format_field_list(input_field, len(_filter), input_field_name, field_type=str)
-
-        if fn_type in ["filter", "score_filter"]:
-            _invert = _format_field_list(invert, len(_filter), "invert", field_type=bool)
-        else:
-            # Score does not use an invert flag
-            _invert = None
-
-        if fn_type in ["score", "score_filter"]:
-            # ScoreFilter is allowed to have no output fields, but Score is not
-            if output_field is None and fn_type == "score_filter":
-                _output_field = [None] * len(_filter)
-            # Output fields are always required to be a (unique) list of strings.
-            # We check that here.
-            elif isinstance(output_field, str) or (isinstance(output_field, list) and len(output_field) == 1):
-                msg = f"score_field must be a list of strings if multiple filters are used: {output_field}"
-                raise ValueError(msg)
-            else:
-                _output_field = _format_field_list(output_field, len(_filter), "score", field_type=str)
-        else:
-            # Filter does not use an output field
-            _output_field = None
-
-    return _name, _normalized_filter, _input_field, _invert, _output_field
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/base.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from dataclasses import dataclass
-from typing import Literal
-
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-import numpy as np
-import pandas as pd
-import torch
-from huggingface_hub import PyTorchModelHubMixin
-from torch import nn
-from transformers import AutoConfig, AutoModel
-
-from nemo_curator.stages.base import CompositeStage, ProcessingStage
-from nemo_curator.stages.text.filters import Filter
-from nemo_curator.stages.text.models.model import ModelStage
-from nemo_curator.stages.text.models.tokenizer import TokenizerStage
-from nemo_curator.stages.text.models.utils import ATTENTION_MASK_FIELD, INPUT_ID_FIELD
-from nemo_curator.tasks import DocumentBatch
-
-from .utils import SortByLengthStage
-
-
-class Deberta(nn.Module, PyTorchModelHubMixin):
-    """
-    Base PyTorch model where we add a classification head.
-
-    Args:
-        config: The configuration of the model.
-
-    """
-
-    def __init__(self, config: dataclass):
-        super().__init__()
-        self.model = AutoModel.from_pretrained(config["base_model"])
-        self.dropout = nn.Dropout(config["fc_dropout"])
-        self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))
-
-    @property
-    def device(self) -> torch.device:
-        return next(self.parameters()).device
-
-    @torch.no_grad()
-    def forward(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
-        features = self.model(batch[INPUT_ID_FIELD], batch[ATTENTION_MASK_FIELD]).last_hidden_state
-        dropped = self.dropout(features)
-        outputs = self.fc(dropped)
-
-        del batch, features, dropped
-
-        return torch.softmax(outputs[:, 0, :], dim=1)
-
-
-class ClassifierModelStage(ModelStage):
-    """
-    Stage for Hugging Face model inference.
-
-    Args:
-        model_identifier: The identifier of the Hugging Face model.
-        label_field: The name of the prediction column.
-        score_field: The name of the probability column. Defaults to None.
-        model_inference_batch_size: The size of the batch for model inference. Defaults to 256.
-        has_seq_order: Whether to sort the input data by the length of the input tokens.
-            Sorting is encouraged to improve the performance of the inference model. Defaults to True.
-        padding_side: The side to pad the input tokens. Defaults to "right".
-        max_seq_length: If provided, clips the input tokens before the forward pass. Defaults to None.
-        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
-            Defaults to True.
-        keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False.
-
-    """
-
-    def __init__(  # noqa: PLR0913
-        self,
-        model_identifier: str,
-        cache_dir: str | None = None,
-        label_field: str = "preds",
-        score_field: str | None = None,
-        model_inference_batch_size: int = 256,
-        has_seq_order: bool = True,
-        padding_side: Literal["left", "right"] = "right",
-        max_seq_length: int | None = None,
-        autocast: bool = True,
-        keep_tokens: bool = False,
-    ):
-        super().__init__(
-            model_identifier=model_identifier,
-            cache_dir=cache_dir,
-            has_seq_order=has_seq_order,
-            model_inference_batch_size=model_inference_batch_size,
-            padding_side=padding_side,
-            max_seq_length=max_seq_length,
-            unpack_inference_batch=False,
-            autocast=autocast,
-        )
-
-        self.label_field = label_field
-        if score_field is not None:
-            self.score_field = score_field
-            self.keep_score_field = True
-        else:
-            self.score_field = "probs"
-            self.keep_score_field = False
-
-        self.keep_tokens = keep_tokens
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        return ["data"], [self.label_field] + ([self.score_field] if self.keep_score_field else [])
-
-    def _setup(self, local_files_only: bool = True) -> None:
-        self.model = (
-            Deberta.from_pretrained(self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only)
-            .cuda()
-            .eval()
-        )
-
-        config = AutoConfig.from_pretrained(
-            self.model_identifier, cache_dir=self.cache_dir, local_files_only=local_files_only
-        )
-        self.labels = list(config.label2id.keys())
-        self.labels.sort(key=lambda x: config.label2id[x])
-
-    def process_model_output(
-        self, outputs: torch.Tensor, _: dict[str, torch.Tensor] | None = None
-    ) -> dict[str, np.ndarray]:
-        probs = outputs.cpu().numpy()
-        preds = np.argmax(probs, axis=1)
-
-        pred_labels = [self.labels[idx] for idx in preds]
-
-        return {
-            self.score_field: probs,
-            self.label_field: np.array(pred_labels),
-        }
-
-    def create_output_dataframe(self, df_cpu: pd.DataFrame, collected_output: dict[str, np.ndarray]) -> pd.DataFrame:
-        if not self.keep_tokens:
-            df_cpu = df_cpu.drop(columns=[INPUT_ID_FIELD, ATTENTION_MASK_FIELD])
-
-        df_cpu[self.label_field] = collected_output[self.label_field]
-
-        if self.keep_score_field:
-            df_cpu[self.score_field] = collected_output[self.score_field].tolist()
-
-        return df_cpu
-
-
-@dataclass(kw_only=True)
-class DistributedDataClassifier(CompositeStage[DocumentBatch, DocumentBatch]):
-    """
-    Base composite stage for distributed data classification.
-
-    It decomposes into a tokenizer stage and a model stage.
-
-    Args:
-        model_identifier: The identifier of the Hugging Face model.
-        cache_dir: The Hugging Face cache directory. Defaults to None.
-        label_field: The name of the prediction column. Defaults to "preds".
-        score_field: The name of the probability column. Defaults to None.
-        text_field: The name of the text field in the input data. Defaults to "text".
-        filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None.
-        max_chars: Limits the total number of characters that can be fed to the tokenizer.
-            If None, text will not be truncated. Defaults to None.
-        max_seq_length: Limits the total sequence returned by the tokenizer so that it has a maximum length.
-            If None, the tokenizer's model_max_length is used. Defaults to 512.
-        padding_side: The side to pad the input tokens. Defaults to "right".
-        sort_by_length: Whether to sort the input data by the length of the input tokens.
-            Sorting is encouraged to improve the performance of the inference model. Defaults to True.
-        model_inference_batch_size: The size of the batch for model inference. Defaults to 256.
-        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
-            Defaults to True.
-        keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False.
-        use_existing_tokens: Whether to use the existing tokens from the input dataframe.
-            If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization.
-            Defaults to False.
-
-    """
-
-    model_identifier: str
-    cache_dir: str | None = None
-    label_field: str = "preds"
-    score_field: str | None = None
-    text_field: str = "text"
-    filter_by: list[str] | None = None
-    max_chars: int | None = None
-    max_seq_length: int | None = None
-    padding_side: Literal["left", "right"] = "right"
-    sort_by_length: bool = True
-    model_inference_batch_size: int = 256
-    autocast: bool = True
-    keep_tokens: bool = False
-    use_existing_tokens: bool = False
-
-    def __post_init__(self) -> None:
-        super().__init__()
-
-        self.stages = []
-
-        if not self.use_existing_tokens:
-            tokenizer_stage = TokenizerStage(
-                model_identifier=self.model_identifier,
-                cache_dir=self.cache_dir,
-                text_field=self.text_field,
-                max_chars=self.max_chars,
-                max_seq_length=self.max_seq_length,
-                padding_side=self.padding_side,
-                sort_by_length=self.sort_by_length,
-            )
-            self.stages.append(tokenizer_stage)
-            # The TokenizerStage already truncates to the max_seq_length, so the ModelStage does not need to do it again
-            model_max_seq_length = None
-        else:
-            # The ModelStage will truncate to the max_seq_length before the forward pass
-            model_max_seq_length = self.max_seq_length
-
-        # Ensure that the data is sorted by length if the tokens are already present and sort_by_length is True
-        if self.use_existing_tokens and self.sort_by_length:
-            sort_by_length_stage = SortByLengthStage()
-            self.stages.append(sort_by_length_stage)
-
-        model_stage = ClassifierModelStage(
-            model_identifier=self.model_identifier,
-            cache_dir=self.cache_dir,
-            label_field=self.label_field,
-            score_field=self.score_field,
-            model_inference_batch_size=self.model_inference_batch_size,
-            has_seq_order=self.sort_by_length,
-            padding_side=self.padding_side,
-            max_seq_length=model_max_seq_length,
-            autocast=self.autocast,
-            keep_tokens=self.keep_tokens,
-        )
-        self.stages.append(model_stage)
-
-        if self.filter_by is not None and len(self.filter_by) > 0:
-            self.stages.append(Filter(filter_fn=self.filter_by_category, filter_field=self.label_field))
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        return self.stages[0].inputs()
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        return self.stages[-1].outputs()
-
-    def filter_by_category(self, value: str) -> bool:
-        return value in self.filter_by
-
-    def decompose(self) -> list[ProcessingStage]:
-        return self.stages
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/domain.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-from nemo_curator.stages.text.models.utils import format_name_with_suffix
-
-from .base import DistributedDataClassifier
-from .utils import DEBERTA_TOKENIZER_PADDING_SIDE
-
-DOMAIN_MODEL_IDENTIFIER = "nvidia/domain-classifier"
-MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER = "nvidia/multilingual-domain-classifier"
-MAX_SEQ_LENGTH = 512
-
-
-class DomainClassifier(DistributedDataClassifier):
-    """
-    DomainClassifier is a specialized classifier designed for English text domain classification tasks,
-    utilizing the NemoCurator Domain Classifier (https://huggingface.co/nvidia/domain-classifier) model.
-    This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets.
-
-    Attributes:
-        cache_dir: The Hugging Face cache directory. Defaults to None.
-        label_field: The name of the prediction column. Defaults to "domain_pred".
-        score_field: The name of the probability column. Defaults to None.
-        text_field: The name of the text field in the input data. Defaults to "text".
-        filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None.
-        max_chars: The maximum number of characters to use from the input text. Defaults to 2000.
-        sort_by_length: Whether to sort the input data by the length of the input tokens.
-            Sorting is encouraged to improve the performance of the inference model. Defaults to True.
-        model_inference_batch_size: The size of the batch for model inference. Defaults to 256.
-        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
-            Defaults to True.
-        keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False.
-        use_existing_tokens: Whether to use the existing tokens from the input dataframe.
-            If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization.
-            Defaults to False.
-
-    """
-
-    def __init__(  # noqa: PLR0913
-        self,
-        cache_dir: str | None = None,
-        label_field: str = "domain_pred",
-        score_field: str | None = None,
-        text_field: str = "text",
-        filter_by: list[str] | None = None,
-        max_chars: int = 2000,
-        sort_by_length: bool = True,
-        model_inference_batch_size: int = 256,
-        autocast: bool = True,
-        keep_tokens: bool = False,
-        use_existing_tokens: bool = False,
-    ):
-        super().__init__(
-            model_identifier=DOMAIN_MODEL_IDENTIFIER,
-            cache_dir=cache_dir,
-            label_field=label_field,
-            score_field=score_field,
-            text_field=text_field,
-            filter_by=filter_by,
-            max_chars=max_chars,
-            max_seq_length=MAX_SEQ_LENGTH,
-            padding_side=DEBERTA_TOKENIZER_PADDING_SIDE,
-            sort_by_length=sort_by_length,
-            model_inference_batch_size=model_inference_batch_size,
-            autocast=autocast,
-            keep_tokens=keep_tokens,
-            use_existing_tokens=use_existing_tokens,
-        )
-
-        self.name = format_name_with_suffix(DOMAIN_MODEL_IDENTIFIER)
-
-
-class MultilingualDomainClassifier(DistributedDataClassifier):
-    """
-    MultilingualDomainClassifier is a specialized classifier designed for domain classification tasks,
-    utilizing the NemoCurator Multilingual Domain Classifier (https://huggingface.co/nvidia/multilingual-domain-classifier) model.
-    It supports domain classification across 52 languages.
-    This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets.
-
-    Attributes:
-        cache_dir: The Hugging Face cache directory. Defaults to None.
-        label_field: The name of the prediction column. Defaults to "multilingual_domain_pred".
-        score_field: The name of the probability column. Defaults to None.
-        text_field: The name of the text field in the input data. Defaults to "text".
-        filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None.
-        max_chars: The maximum number of characters to use from the input text. Defaults to 2000.
-        sort_by_length: Whether to sort the input data by the length of the input tokens.
-            Sorting is encouraged to improve the performance of the inference model. Defaults to True.
-        model_inference_batch_size: The size of the batch for model inference. Defaults to 256.
-        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
-            Defaults to True.
-        keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False.
-        use_existing_tokens: Whether to use the existing tokens from the input dataframe.
-            If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization.
-            Defaults to False.
-
-    """
-
-    def __init__(  # noqa: PLR0913
-        self,
-        cache_dir: str | None = None,
-        label_field: str = "multilingual_domain_pred",
-        score_field: str | None = None,
-        text_field: str = "text",
-        filter_by: list[str] | None = None,
-        max_chars: int = 2000,
-        sort_by_length: bool = True,
-        model_inference_batch_size: int = 256,
-        autocast: bool = True,
-        keep_tokens: bool = False,
-        use_existing_tokens: bool = False,
-    ):
-        super().__init__(
-            model_identifier=MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER,
-            cache_dir=cache_dir,
-            label_field=label_field,
-            score_field=score_field,
-            text_field=text_field,
-            filter_by=filter_by,
-            max_chars=max_chars,
-            max_seq_length=MAX_SEQ_LENGTH,
-            padding_side=DEBERTA_TOKENIZER_PADDING_SIDE,
-            sort_by_length=sort_by_length,
-            model_inference_batch_size=model_inference_batch_size,
-            autocast=autocast,
-            keep_tokens=keep_tokens,
-            use_existing_tokens=use_existing_tokens,
-        )
-
-        self.name = format_name_with_suffix(MULTILINGUAL_DOMAIN_MODEL_IDENTIFIER)
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/stages/text/classifiers/quality.py
-```py
-# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-os.environ["RAPIDS_NO_INITIALIZE"] = "1"
-
-from nemo_curator.stages.text.models.utils import format_name_with_suffix
-
-from .base import DistributedDataClassifier
-from .utils import DEBERTA_TOKENIZER_PADDING_SIDE
-
-QUALITY_CLASSIFIER_MODEL_IDENTIFIER = "nvidia/quality-classifier-deberta"
-MAX_SEQ_LENGTH = 1024
-
-
-class QualityClassifier(DistributedDataClassifier):
-    """
-    QualityClassifier is a specialized classifier designed for quality assessment tasks,
-    utilizing the NemoCurator Quality Classifier DeBERTa model (https://huggingface.co/nvidia/quality-classifier-deberta).
-    This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large datasets.
-
-    Attributes:
-        cache_dir: The Hugging Face cache directory. Defaults to None.
-        label_field: The name of the prediction column. Defaults to "quality_pred".
-        score_field: The name of the probability column. Defaults to None.
-        text_field: The name of the text field in the input data. Defaults to "text".
-        filter_by: For categorical classifiers, the list of labels to filter the data by. Defaults to None.
-        max_chars: Limits the total number of characters that can be fed to the tokenizer.
-            If None, text will not be truncated. Defaults to 6000.
-        sort_by_length: Whether to sort the input data by the length of the input tokens.
-            Sorting is encouraged to improve the performance of the inference model. Defaults to True.
-        model_inference_batch_size: The size of the batch for model inference. Defaults to 256.
-        autocast: Whether to use autocast. When True, we trade off minor accuracy for faster inference.
-            Defaults to True.
-        keep_tokens: Whether to keep the input tokens in the output dataframe. Defaults to False.
-        use_existing_tokens: Whether to use the existing tokens from the input dataframe.
-            If True, assume the relevant token fields are ["input_ids", "attention_mask"] and skip tokenization.
-            Defaults to False.
-
-    """
-
-    def __init__(  # noqa: PLR0913
-        self,
-        cache_dir: str | None = None,
-        label_field: str = "quality_pred",
-        score_field: str | None = None,
-        text_field: str = "text",
-        filter_by: list[str] | None = None,
-        max_chars: int = 6000,
-        sort_by_length: bool = True,
-        model_inference_batch_size: int = 256,
-        autocast: bool = True,
-        keep_tokens: bool = False,
-        use_existing_tokens: bool = False,
-    ):
-        super().__init__(
-            model_identifier=QUALITY_CLASSIFIER_MODEL_IDENTIFIER,
-            cache_dir=cache_dir,
-            label_field=label_field,
-            score_field=score_field,
-            text_field=text_field,
-            filter_by=filter_by,
-            max_chars=max_chars,
-            max_seq_length=MAX_SEQ_LENGTH,
-            padding_side=DEBERTA_TOKENIZER_PADDING_SIDE,
-            sort_by_length=sort_by_length,
-            model_inference_batch_size=model_inference_batch_size,
-            autocast=autocast,
-            keep_tokens=keep_tokens,
-            use_existing_tokens=use_existing_tokens,
-        )
-
-        self.name = format_name_with_suffix(QUALITY_CLASSIFIER_MODEL_IDENTIFIER)
-
-```
-
-File: /Users/mromeijn/src/Curator/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml
-```yaml
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-defaults:
-  - _self_
-  - override hydra/job_logging: none
-  - override hydra/hydra_logging: none
-
-hydra:
-  run:
-    dir: .
-  output_subdir: null
-
-documentation: |
-  NeMo Curator Pipeline English Heuristic Filter Configuration File
-  #################################################################
-  This configuration file can be used to build a NeMo Curator pipeline that filters English text.
-  This example reads the input files, runs the heuristic filters, and saves the results.
-
-  The filters below define a chain of heuristic filters to be applied to each document in a corpus.
-  This particular cascade of filters is intended to filter English language data.
-  The filter listed at the top will be applied first, and the following filters will be applied in
-  the order they appear in this file. Each filter can be removed and re-ordered as desired.
-
-  To customize your own pipeline, you can add or remove stages from the stages list,
-  where _target_ is the stage class and includes all the parameters for the stage.
-
-input_path: ???
-output_path: ???
-text_field: text
-
-stages:
-  - _target_: nemo_curator.stages.text.io.reader.JsonlReader
-    file_paths: ${input_path}
-    files_per_partition: null
-    blocksize: null
-    fields: null
-
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.NonAlphaNumericFilter
-      max_non_alpha_numeric_to_text_ratio: 0.25
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.SymbolsToWordsFilter
-      max_symbol_to_word_ratio: 0.1
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.NumbersFilter
-      max_number_to_text_ratio: 0.15
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.UrlsFilter
-      max_url_to_text_ratio: 0.2
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.WhiteSpaceFilter
-      max_white_space_ratio: 0.25
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.ParenthesesFilter
-      max_parentheses_ratio: 0.1
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.BoilerPlateStringFilter
-      remove_if_at_top_or_bottom: True
-      max_boilerplate_string_ratio: 0.4
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedLinesFilter
-      max_repeated_line_fraction: 0.7
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedParagraphsFilter
-      max_repeated_paragraphs_ratio: 0.7
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedLinesByCharFilter
-      max_repeated_lines_char_ratio: 0.8
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatedParagraphsByCharFilter
-      max_repeated_paragraphs_char_ratio: 0.8
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.WordCountFilter
-      min_words: 50
-      max_words: 100000
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.PunctuationFilter
-      max_num_sentences_without_endmark_ratio: 0.85
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.WordsWithoutAlphabetsFilter
-      min_words_with_alphabets: 0.8
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.CommonEnglishWordsFilter
-      min_num_common_words: 2
-      stop_at_false: True
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.MeanWordLengthFilter
-      max_mean_word_length: 10
-      min_mean_word_length: 3
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.LongWordFilter
-      max_word_length: 1000
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.EllipsisFilter
-      max_num_lines_ending_with_ellipsis_ratio: 0.3
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Top N-Gram filters for N-gram 2
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter
-      n: 2
-      max_repeating_ngram_ratio: 0.2
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Top N-Gram filters for N-gram 3
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter
-      n: 3
-      max_repeating_ngram_ratio: 0.18
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Top N-Gram filters for N-gram 4
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingTopNGramsFilter
-      n: 4
-      max_repeating_ngram_ratio: 0.16
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 5
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 5
-      max_repeating_duplicate_ngram_ratio: 0.15
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 6
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 6
-      max_repeating_duplicate_ngram_ratio: 0.14
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 7
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 7
-      max_repeating_duplicate_ngram_ratio: 0.13
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 8
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 8
-      max_repeating_duplicate_ngram_ratio: 0.12
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 9
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 9
-      max_repeating_duplicate_ngram_ratio: 0.11
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      # Duplicate N-gram filters for N-gram 10
-      _target_: nemo_curator.stages.text.filters.heuristic.repetition.RepeatingDuplicateNGramsFilter
-      n: 10
-      max_repeating_duplicate_ngram_ratio: 0.10
-    text_field: ${text_field}
-    score_field: null
-  - _target_: nemo_curator.stages.text.filters.score_filter.ScoreFilter
-    filter_obj:
-      _target_: nemo_curator.stages.text.filters.heuristic.string.BulletsFilter
-      max_bullet_lines_ratio: 0.9
-    text_field: ${text_field}
-    score_field: null
-
-  - _target_: nemo_curator.stages.text.io.writer.JsonlWriter
-    path: ${output_path}
-    fields: null
-
-```
-
-File: /Users/mromeijn/src/Curator/tutorials/text/distributed-data-classification/README.md
-```md
-# Distributed Data Classification
-
-The following is a set of Jupyter notebook tutorials which demonstrate how to use various text classification models supported by NeMo Curator.
-The goal of using these classifiers is to help with data annotation, which is useful in data blending for foundation model training.
-
-Each of these classifiers is available on Hugging Face and can be run independently with the [Transformers](https://github.com/huggingface/transformers) library.
-By running them with NeMo Curator, the classifiers are accelerated using a heterogeneous pipeline setup where tokenization is run across CPUs and model inference is run across GPUs.
-Each of the Jupyter notebooks in this directory demonstrates how to run the classifiers on text data and is easily scalable to large amounts of data.
-
-Before running any of these notebooks, see this [Installation Guide](https://docs.nvidia.com/nemo/curator/latest/admin/installation.html#admin-installation) page for instructions on how to install NeMo Curator. Be sure to use an installation method which includes GPU dependencies.
-
-For more information about the classifiers, refer to our [Distributed Data Classification](https://docs.nvidia.com/nemo/curator/latest/curate-text/process-data/quality-assessment/distributed-classifier.html) documentation page.
-
-## List of Classifiers
-
-<div align="center">
-
-| NeMo Curator Classifier | Description | Hugging Face Page |
-| --- | --- | --- |
-| `AegisClassifier` | Identify and categorize unsafe content per document | [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0) and [nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0](https://huggingface.co/nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0) |
-| `ContentTypeClassifier` | Categorize the type-of-speech per document | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) |
-| `DomainClassifier` | Categorize the domain per document | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) |
-| `FineWebEduClassifier` | Determine the educational value per document; this model was trained using annotations from Llama 3 70B-Instruct | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) |
-| `FineWebMixtralEduClassifier` | Determine the educational value per document; this model was trained using annotations from Mixtral 8x22B-Instruct | [nvidia/nemocurator-fineweb-mixtral-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) |
-| `FineWebNemotronEduClassifier` | Determine the educational value per document; this model was trained using annotations from Nemotron-4-340B-Instruct | [nvidia/nemocurator-fineweb-nemotron-4-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) |
-| `InstructionDataGuardClassifier` | Identify LLM poisoning attacks per document | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) |
-| `MultilingualDomainClassifier` | Categorize the domain per document; supports classification in 52 languages | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) |
-| `PromptTaskComplexityClassifier` | Classifies text prompts across task types and complexity dimensions | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) |
-| `QualityClassifier` | Categorize documents as high, medium, or low quality | [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) |
-
-</div>
-
-Note that all classifiers support English text classification only, except the `MultilingualDomainClassifier`.
-
-## Bring Your Own Classifier
-
-Advanced users may want to integrate their own Hugging Face classifier(s) into NeMo Curator. Broadly, this requires creating a `CompositeStage` consisting of a CPU-based tokenizer stage and a GPU-based model inference stage. Refer to the [Text Classifiers README](https://github.com/NVIDIA-NeMo/Curator/tree/main/nemo_curator/stages/text/classifiers#text-classifiers) for details about how to do this.
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/eval-deploy-formats.txt b/skills/nemotron-customize/context/eval-deploy-formats.txt
deleted file mode 100644
index a417d966a..000000000
--- a/skills/nemotron-customize/context/eval-deploy-formats.txt
+++ /dev/null
@@ -1,743 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Evaluator
-├── docs
-│   └── deployment
-│       ├── launcher-orchestrated
-│       │   └── index.md *
-│       └── nemo-fw
-│           ├── hf.md *
-│           ├── index.md *
-│           └── mbridge.md *
-└── packages
-    └── nemo-evaluator-launcher
-        ├── examples
-        │   ├── local_nim.yaml *
-        │   ├── local_vllm_logprobs.yaml *
-        │   └── slurm_nim.yaml *
-        └── src
-            └── nemo_evaluator_launcher
-                ├── configs
-                │   ├── deployment
-                │   │   ├── nim.yaml *
-                │   │   └── vllm.yaml *
-                │   └── default.yaml *
-                └── resources
-                    └── config_templates
-                        └── deployment
-                            ├── nim.yaml *
-                            └── vllm.yaml *
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/index.md
-```md
-(deployment-nemo-fw)=
-# Deploy and Evaluate Checkpoints Trained by NeMo Framework
-
-The NeMo Framework is NVIDIA’s GPU-accelerated, end-to-end training platform for large language models (LLMs), multimodal models, and speech models. It enables seamless scaling of both pretraining and post-training workloads, from a single GPU to clusters with thousands of nodes, supporting Hugging Face/PyTorch and Megatron models. NeMo includes a suite of libraries and curated training recipes to help users build models from start to finish.
-
-The NeMo Evaluator is integrated within NeMo Framework, offering streamlined deployment and advanced evaluation capabilities for models trained using NeMo, leveraging state-of-the-art evaluation harnesses.
-
-## Features
-
-- **Multi-Backend Deployment**: Supports PyTriton and multi-instance evaluations using the Ray Serve deployment backend
-- **Production-Ready**: Supports high-performance inference with CUDA graphs and flash decoding for Megatron models, vLLM backend for Hugging Face models and TRTLLM engine for TRTLLM models
-- **Multi-GPU and Multi-Node Support**: Enables distributed inference across multiple GPUs and compute nodes
-- **OpenAI-Compatible API**: Provides RESTful endpoints aligned with OpenAI API specifications
-
-## Architecture
-
-### 1. Deployment Layer
-
-- **PyTriton Backend**: Provides high-performance inference through the NVIDIA Triton Inference Server, with OpenAI API compatibility via a FastAPI interface. Supports model parallelism across single-node and multi-node configurations. Note: Multi-instance evaluation is not supported.
-- **Ray Backend**: Enables multi-instance evaluation with model parallelism on a single node using Ray Serve, while maintaining OpenAI API compatibility. Multi-node support is coming soon.
-
-For more information on the deployment, please see [NeMo Export-Deploy](https://github.com/NVIDIA-NeMo/Export-Deploy).
-
-### 2. Evaluation Layer
-
-- **NeMo Evaluator**: Provides standardized benchmark evaluations using packages from NVIDIA Eval Factory, bundled in the NeMo Framework container. The `lm-evaluation-harness` is pre-installed by default, and additional evaluation packages can be added as needed. For more information, see {ref}`core-wheels` and {ref}`lib-core`.
-
-
-
-```{toctree}
-:maxdepth: 1
-:hidden:
-
-Introduction <self>
-PyTriton Serving Backend <pytriton>
-Ray Serving Backend <ray>
-Evaluate Megatron Bridge Checkpoints <mbridge>
-Evaluate Automodel Checkpoints <hf>
-Evaluate TRTLLM Checkpoints <trtllm>
-```
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/hf.md
-```md
-# Evaluate Automodel Checkpoints Trained by NeMo Framework
-
-This guide provides step-by-step instructions for evaluating checkpoints trained using the NeMo Framework with the Automodel backend. This section specifically covers evaluation with [nvidia-lm-eval](https://pypi.org/project/nvidia-lm-eval/), a wrapper around the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) tool.
-
-Here, we focus on benchmarks within the `lm-evaluation-harness` that depend on text generation. Evaluation on log-probability-based benchmarks is available in [Evaluate Automodel Checkpoints on Log-probability benchmarks](#evaluate-automodel-checkpoints-on-log-probability-benchmarks).
-
-## Deploy Automodel Checkpoints
-
-This section outlines the steps to deploy Automodel checkpoints using Python commands.
-
-Automodel checkpoint deployment uses Ray Serve as the serving backend. It also offers an OpenAI API (OAI)-compatible endpoint, similar to deployments of checkpoints trained with the Megatron Core backend. An example deployment command is shown below.
-
-```{literalinclude} _snippets/deploy_hf.sh
-:language: bash
-:start-after: "# [snippet-start]"
-:end-before: "# [snippet-end]"
-```
-
-The `--model_path` can refer to either a local checkpoint path or a Hugging Face model ID, as shown in the example above. In the example above, checkpoint deployment uses the `vLLM` backend. To enable accelerated inference, install `vLLM` in your environment. To install `vLLM` inside the NeMo Framework container, follow the steps below as shared in [Export-Deploy's README](https://github.com/NVIDIA-NeMo/Export-Deploy?tab=readme-ov-file#install-tensorrt-llm-vllm-or-trt-onnx-backend:~:text=cd%20/opt/export%2ddeploy%0auv%20sync%20%2d%2dinexact%20%2d%2dlink%2dmode%20symlink%20%2d%2dlocked%20%2d%2dextra%20vllm%20%24(cat%20/opt/uv_args.txt)):
-
-```shell
-cd /opt/Export-Deploy
-uv sync --inexact --link-mode symlink --locked --extra vllm $(cat /opt/uv_args.txt)
-```
-
-To install `vLLM` outside of the NeMo Framework container, follow the steps mentioned [here](https://github.com/NVIDIA-NeMo/Export-Deploy?tab=readme-ov-file#install-tensorrt-llm-vllm-or-trt-onnx-backend:~:text=install%20transformerengine%20%2b%20vllm).
-
-:::{note}
-The 25.11 release of the NeMo Framework container comes with `vLLM` pre-installed, so it is not necessary to install it explicitly. However, for all previous releases, please refer to the instructions above to install `vLLM` inside the NeMo Framework container.
-:::
-
-If you prefer to evaluate the Automodel checkpoint without using the `vLLM` backend, remove the `--use_vllm_backend` flag from the command above.
-
-:::{note}
-To speed up evaluation using multiple instances, increase the `num_replicas` parameter.
-For additional guidance, refer to {ref}`nemo-fw-ray`.
-:::
-
-## Evaluate Automodel Checkpoints
-
-This section outlines the steps to evaluate Automodel checkpoints using Python commands. This method is quick and easy, making it ideal for interactive evaluations. 
-
-Once deployment is successful, you can run evaluations using the {ref}`lib-core` API.
-
-Before starting the evaluation, it’s recommended to use the [`check_endpoint`](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator/src/nemo_evaluator/core/utils.py) function to verify that the endpoint is responsive and ready to accept requests.
-
-```{literalinclude} _snippets/mmlu.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-## Evaluate Automodel Checkpoints on Log-probability Benchmarks
-
-To evaluate Automodel checkpoints on benchmarks that require log-probabilities, use the same deployment command provided in [Deploy Automodel Checkpoints](#deploy-automodel-checkpoints). These benchmarks are supported by both the `vLLM` backend (enabled via the `--use_vllm_backend` flag) and by directly deploying the Automodel checkpoint.
-
-For evaluation, you must specify the path to the `tokenizer` and set the `tokenizer_backend` parameter as shown below. The `tokenizer` files are located within the checkpoint directory.
-
-```{literalinclude} _snippets/arc_challenge_hf.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-## Evaluate Automodel Checkpoints on Chat Benchmarks
-
-To evaluate Automodel checkpoints on chat benchmarks you need the chat endpoint (`/v1/chat/completions/`). The deployment command provided in [Deploy Automodel Checkpoints](#deploy-automodel-checkpoints) also exposes the chat endpoint, and the same command can be used for evaluating on chat benchmarks.
-
-For evaluation, update the URL by replacing `/v1/completions/` with `/v1/chat/completions/` as shown below. Additionally, set the `type` field to `"chat"` to indicate a chat benchmark.
-
-```{literalinclude} _snippets/ifeval.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/deployment/nemo-fw/mbridge.md
-```md
-# Evaluate Megatron Bridge Checkpoints Trained by NeMo Framework
-
-This guide provides step-by-step instructions for evaluating [Megatron Bridge](https://docs.nvidia.com/nemo/megatron-bridge/latest/index.html) checkpoints trained using the NeMo Framework with the Megatron Core backend. This section specifically covers evaluation with [nvidia-lm-eval](https://pypi.org/project/nvidia-lm-eval/), a wrapper around the [
-lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main) tool.
-
-First, we focus on benchmarks within the `lm-evaluation-harness` that depend on text generation. Evaluation on log-probability-based benchmarks is available in the subsequent section [Evaluate Megatron Bridge Checkpoints on Log-probability benchmarks](#evaluate-megatron-bridge-checkpoints-on-log-probability-benchmarks).
-
-## Deploy Megatron Bridge Checkpoints
-
-To evaluate a checkpoint saved during pretraining or fine-tuning with [Megatron-Bridge](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html), provide the path to the saved checkpoint using the `--megatron_checkpoint` flag in the deployment command below. Otherwise, Hugging Face checkpoints can be converted to Megatron Bridge using the single shell command:
-
-```bash
-huggingface-cli login --token <your token>
-python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-llama/Meta-Llama-3-8B','/workspace/mbridge_llama3_8b/')"
-```
-
-The deployment scripts are available inside the [`/opt/Export-Deploy/scripts/deploy/nlp/`](https://github.com/NVIDIA-NeMo/Export-Deploy/tree/main/scripts/deploy/nlp) directory. Below is an example command for deployment. It uses a Hugging Face LLaMA 3 8B checkpoint that has been converted to Megatron Bridge format using the command shared above.
-
-```{literalinclude} _snippets/deploy_mbridge.sh
-:language: bash
-:start-after: "# [snippet-start]"
-:end-before: "# [snippet-end]"
-```
-
-:::{note}
-Megatron Bridge creates checkpoints in directories named `iter_N`, where *N* is the iteration number. Each `iter_N` directory contains model weights and related artifacts. When using a checkpoint, make sure to provide the path to the appropriate `iter_N` directory. Hugging Face checkpoints converted for Megatron Bridge are typically stored in a directory named `iter_0000000`, as shown in the command above.
-:::
-
-:::{note}
-Megatron Bridge deployment for evaluation is supported only with Ray Serve and not PyTriton.
-:::
-
-## Evaluate Megatron Bridge Checkpoints
-
-Once deployment is successful, you can run evaluations using the NeMo Evaluator API. See {ref}`lib-core` for more details.
-
-Before starting the evaluation, it’s recommended to use the [`check_endpoint`](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator/src/nemo_evaluator/core/utils.py) function to verify that the endpoint is responsive and ready to accept requests.
-
-```{literalinclude} _snippets/mmlu.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-## Evaluate Megatron Bridge Checkpoints on Log-probability Benchmarks
-
-To evaluate Megatron Bridge checkpoints on benchmarks that require log-probabilities, use the same deployment command provided in [Deploy Megatron Bridge Checkpoints](#deploy-megatron-bridge-checkpoints).
-
-For evaluation, you must specify the path to the `tokenizer` and set the `tokenizer_backend` parameter as shown below. The `tokenizer` files are located within the `tokenizer` directory of the checkpoint.
-
-```{literalinclude} _snippets/arc_challenge_mbridge.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-## Evaluate Megatron Bridge Checkpoints on Chat Benchmarks
-
-To evaluate Megatron Bridge checkpoints on chat benchmarks you need the chat endpoint (`/v1/chat/completions/`). The deployment command provided in [Deploy Megatron Bridge Checkpoints](#deploy-megatron-bridge-checkpoints) also exposes the chat endpoint, and the same command can be used for evaluating on chat benchmarks.
-
-For evaluation, update the URL by replacing `/v1/completions/` with `/v1/chat/completions/` as shown below. Additionally, set the `type` field to `"chat"` to indicate a chat benchmark.
-
-```{literalinclude} _snippets/ifeval.py
-:language: python
-:start-after: "## Run the evaluation"
-```
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/deployment/launcher-orchestrated/index.md
-```md
----
-orphan: true
----
-(launcher-orchestrated-deployment)=
-
-# Launcher-Orchestrated Deployment
-
-Let NeMo Evaluator Launcher handle both model deployment and evaluation orchestration automatically. This is the recommended approach for most users, providing automated lifecycle management, multi-backend support, and integrated monitoring.
-
-## Overview
-
-Launcher-orchestrated deployment means the launcher:
-- Deploys your model using the specified deployment type
-- Manages the model serving lifecycle
-- Runs evaluations against the deployed model
-- Handles cleanup and resource management
-
-The launcher supports multiple deployment backends and execution environments.
-
-## Quick Start
-
-```bash
-# Deploy model and run evaluation in one command (Slurm example)
-HOSTNAME=cluster-login-node.com
-ACCOUNT=my_account
-OUT_DIR=/absolute/path/on/login/node
-
-nemo-evaluator-launcher run \
-    -o execution.hostname=$HOSTNAME \
-    -o execution.account=$ACCOUNT \
-    -o execution.output_dir=$OUT_DIR \
-    --config packages/nemo-evaluator-launcher/examples/slurm_vllm_basic.yaml
-```
-
-## Execution Backends
-
-Choose the execution backend that matches your infrastructure:
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`desktop-download;1.5em;sd-mr-1` Local Execution
-:link: local
-:link-type: doc
-Run evaluations on your local machine against existing endpoints. **Note**: Local executor does **not** deploy models. Use Slurm or Lepton for deployment.
-:::
-
-:::{grid-item-card} {octicon}`server;1.5em;sd-mr-1` Slurm Deployment
-:link: slurm
-:link-type: doc
-Deploy on HPC clusters with Slurm workload manager. Ideal for large-scale evaluations with multi-node parallelism.
-:::
-
-:::{grid-item-card} {octicon}`cloud;1.5em;sd-mr-1` Lepton Deployment
-:link: lepton
-:link-type: doc
-Deploy on Lepton AI cloud platform. Best for cloud-native deployments with managed infrastructure and auto-scaling.
-:::
-
-::::
-
-## Deployment Types
-
-The launcher supports multiple deployment types:
-
-### vLLM Deployment
-- **Fast inference** with optimized attention mechanisms
-- **Continuous batching** for high throughput
-- **Tensor parallelism** support for large models
-- **Memory optimization** with configurable GPU utilization
-
-### NIM Deployment  
-- **Production-grade reliability** with enterprise features
-- **NVIDIA optimized containers** for maximum performance
-- **Built-in monitoring** and logging capabilities
-- **Enterprise security** features
-
-### SGLang Deployment
-- **Structured generation** support for complex tasks
-- **Function calling** capabilities
-- **JSON mode** for structured outputs
-- **Efficient batching** for high throughput
-
-### No Deployment
-- **Use existing endpoints** without launcher deployment
-- **Bring-your-own-endpoint** integration
-- **Flexible configuration** for any OpenAI-compatible API
-
-## Configuration Overview
-
-Basic configuration structure for launcher-orchestrated deployment:
-
-```yaml
-# Use Hydra defaults to compose config
-defaults:
-  - execution: slurm/default  # or lepton/default; local does not deploy
-  - deployment: vllm  # or nim, sglang, none
-  - _self_
-
-# Deployment configuration
-deployment:
-  checkpoint_path: /path/to/model  # Or HuggingFace model ID
-  served_model_name: my-model
-  # ... deployment-specific options
-
-# Execution backend configuration
-execution:
-  account: my-account
-  output_dir: /path/to/results
-  # ... backend-specific options
-
-# Evaluation tasks
-evaluation:
-  tasks:
-    - name: mmlu_pro
-    - name: gsm8k
-```
-
-## Key Benefits
-
-### Automated Lifecycle Management
-- **Deployment automation**: No manual setup required
-- **Resource management**: Automatic allocation and cleanup  
-- **Error handling**: Built-in retry and recovery mechanisms
-- **Monitoring integration**: Real-time status and logging
-
-### Multi-Backend Support
-- **Consistent interface**: Same commands work across all backends
-- **Environment flexibility**: Local development to production clusters
-- **Resource optimization**: Backend-specific optimizations
-- **Scalability**: From single GPU to multi-node deployments
-
-### Integrated Workflows
-- **End-to-end automation**: From model to results in one command
-- **Configuration management**: Version-controlled, reproducible configs
-- **Result integration**: Built-in export and analysis tools
-- **Monitoring and debugging**: Comprehensive logging and status tracking
-
-## Getting Started
-
-1. **Choose your backend**: Start with {ref}`launcher-orchestrated-local` for development
-2. **Configure your model**: Set deployment type and model path
-3. **Run evaluation**: Use the launcher to deploy and evaluate
-4. **Monitor progress**: Check status and logs during execution
-5. **Analyze results**: Export and analyze evaluation outcomes
-
-## Next Steps
-
-- **Local Development**: Start with {ref}`launcher-orchestrated-local` for testing
-- **Scale Up**: Move to {ref}`launcher-orchestrated-slurm` for production workloads  
-- **Cloud Native**: Try {ref}`launcher-orchestrated-lepton` for managed infrastructure
-- **Configure Adapters**: Set up {ref}`adapters` for custom processing
-
-```{toctree}
-:maxdepth: 1
-:hidden:
-
-Local Deployment <local>
-Slurm Deployment <slurm>
-Lepton Deployment <lepton>
-```
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_nim.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# How to use:
-#
-# 1. copy this file locally or clone the repository
-# 2. set the NIM image and model name for your model
-# 3. replace /path/to/nim/cache with the absolute path to the NIM cache directory on your machine
-# 4. (optional) comment out limit_samples to run on the full dataset
-# 5. run `nemo-evaluator-launcher run --config path/to/local_nim.yaml`
-#
-# ⚠️  WARNING:
-#     Always run full evaluations (without limit_samples) for actual benchmark results.
-#     Using a subset of samples is solely for testing configuration and setup.
-#     Results from such test runs should NEVER be used to compare models or
-#     report benchmark performance.
-
-defaults:
-  - execution: local
-  - deployment: nim
-  - _self_
-
-execution:
-  output_dir: nel-results-nim
-  mounts:
-    deployment:
-      # Replace /path/to/nim/cache with the absolute path to the NIM cache directory on your machine
-      /path/to/nim/cache: /opt/nim/.cache
-
-# NIM deployment configuration
-# Note: model_id is auto-derived from deployment.served_model_name
-deployment:
-  image: nvcr.io/nim/meta/llama-3.2-1b-instruct:latest
-  served_model_name: meta/llama-3.2-1b-instruct
-  env_vars:
-    NGC_API_KEY: host:NGC_API_KEY
-
-# Specify the benchmarks to evaluate
-evaluation:
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        parallelism: 4
-        limit_samples: 10 # TEST ONLY: remove for full evaluation
-  tasks:
-    - name: lm-evaluation-harness.ifeval
-    - name: gsm8k
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/slurm_nim.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# How to use:
-#
-# 1. copy this file locally or clone the repository
-# 2. set the required values (marked with ???) or pass them via -o cli arguments, e.g.
-#    -o execution.hostname=my-cluster.com -o execution.output_dir=/path/on/cluster -o execution.account=my-account
-# 3. replace /path/to/nim/cache with the absolute path to the NIM cache directory on the cluster
-# 4. (optional) run with 10 samples for quick testing:
-#    -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
-# 5. run full evaluation:
-#    nemo-evaluator-launcher run --config path/to/slurm_nim.yaml
-#
-# ⚠️  WARNING:
-#     Always run full evaluations (without limit_samples) for actual benchmark results.
-#     Using a subset of samples is solely for testing configuration and setup.
-#     Results from such test runs should NEVER be used to compare models or
-#     report benchmark performance.
-
-defaults:
-  - execution: slurm/default
-  - deployment: nim
-  - _self_
-
-# SLURM execution configuration
-execution:
-  hostname: ???  # SLURM headnode hostname (required)
-  account: ???  # SLURM account (required)
-  output_dir: ???  # ABSOLUTE path on cluster (required)
-  mounts:
-    deployment:
-      # Replace /path/to/nim/cache with the absolute path to the NIM cache directory on the cluster
-      /path/to/nim/cache: /opt/nim/.cache
-
-# NIM deployment configuration
-deployment:
-  image: nvcr.io/nim/meta/llama-3.2-1b-instruct:latest
-  served_model_name: meta/llama-3.2-1b-instruct
-  env_vars:
-    NGC_API_KEY: host:NGC_API_KEY
-
-# Specify the benchmarks to evaluate
-evaluation:
-  tasks:
-    - name: lm-evaluation-harness.ifeval
-    - name: gsm8k
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# How to use:
-#
-# 1. copy this file locally or clone the repository
-# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
-# 3. run `nemo-evaluator-launcher run --config path/to/local_vllm_logprobs.yaml`
-
-# ⚠️  WARNING: 
-#     Always run full evaluations (without limit_samples) for actual benchmark results.
-#     Using a subset of samples is solely for testing configuration and setup.
-#     Results from such test runs should NEVER be used to compare models or
-#     report benchmark performance.
-
-# [docs-start-snippet]
-defaults:
-  - execution: local
-  - deployment: vllm
-  - _self_
-
-execution:
-  output_dir: llama_local
-deployment:
-  checkpoint_path: null
-  hf_model_handle: meta-llama/Llama-3.1-8B
-  served_model_name: meta-llama/Llama-3.1-8B
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  extra_args: "--max-model-len 32768"
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-
-# specify the benchmarks to evaluate
-evaluation:
-  # global config settings that apply to all tasks, unless overridden by task-specific config
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600  # timeout for API request in seconds
-        parallelism: 1  # 1 parallel request to avoid overloading the server
-        # limit_samples: 10 # uncomment to limit number of samples for quick testing
-        extra:  # for log-probability tasks like piqa, you need to specify the tokenizer
-          tokenizer: meta-llama/Llama-3.1-8B  # or use a path to locally stored checkpoint
-          tokenizer_backend: huggingface      # or "tiktoken"
-  env_vars:
-      HF_TOKEN: host:HF_TOKEN  # needed to access the tokenizer on the client side
-  tasks:
-    - name: piqa
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/default.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-# Top-level env vars applied to all jobs (deployment + evaluation).
-# Values use explicit prefixes: "host:VAR_NAME", "lit:value", "runtime:VAR_NAME".
-# Section-level and task-level env_vars override these.
-env_vars: {}
-
-# NOTE(dfridman): If deployment is used, `target` parameters will be automatically populated.
-target:
-  api_endpoint:
-    url: ???
-    model_id: ???
-    api_key_name: "<YOUR_API_KEY_NAME>" # NOTE: the name of the env var
-
-evaluation: []
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/nim.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-type: nim
-image: ??? # e.g., nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6
-served_model_name: ???
-port: 8000
-
-command: /opt/nim/start_server.sh
-
-# NIM containers use default entrypoint - no custom command needed
-# Configuration is done via environment variables in lepton_config
-
-endpoints:
-  chat: /v1/chat/completions
-  completions: /v1/completions
-  health: /v1/health/ready
-# Note: Environment variables should be configured in lepton_config.envs
-# Auto-derived environment variables from deployment config:
-# - SERVED_MODEL_NAME (from served_model_name)
-# - NIM_MODEL_NAME (from served_model_name for NIM)
-# - MODEL_PORT (from port)
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-type: vllm
-image: vllm/vllm-openai:latest
-checkpoint_path: ???
-served_model_name: ???
-port: 8000
-tensor_parallel_size: 8
-pipeline_parallel_size: 1
-data_parallel_size: 1
-gpu_memory_utilization: 0.95
-extra_args: ""
-env_vars: {} # {name: value} dict
-
-endpoints:
-  chat: /v1/chat/completions
-  completions: /v1/completions
-  health: /health
-
-command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint}
-  --tensor-parallel-size=${deployment.tensor_parallel_size}
-  --pipeline-parallel-size=${deployment.pipeline_parallel_size}
-  --data-parallel-size=${deployment.data_parallel_size}
-  --port ${deployment.port}
-  --trust-remote-code
-  --served-model-name ${deployment.served_model_name}
-  --gpu-memory-utilization ${deployment.gpu_memory_utilization}
-  ${deployment.extra_args}
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/deployment/nim.yaml
-```yaml
-defaults:
-  - deployment: nim
-
-execution:
-  env_vars:
-    deployment:
-      NGC_API_KEY: $NGC_API_KEY # Required for NIM container authentication
-  mounts:
-    deployment:
-      /path/to/nim/cache: /opt/nim/.cache # Replace with absolute path to NIM cache directory
-
-deployment:
-  image: ??? # NIM image (e.g., nvcr.io/nim/meta/llama-3.2-1b-instruct:latest)
-  served_model_name: ??? # Model name (e.g., meta/llama-3.2-1b-instruct)
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/deployment/vllm.yaml
-```yaml
-defaults:
-  - deployment: vllm
-
-execution:
-  env_vars:
-    deployment:
-      HF_TOKEN: $HF_TOKEN # Required for gated HuggingFace models
-
-deployment:
-  checkpoint_path: null # Set to path if using local checkpoint
-  hf_model_handle: ??? # HuggingFace model handle (e.g., meta-llama/Llama-3.1-8B)
-  served_model_name: ??? # Model name for API (e.g., meta-llama/Llama-3.1-8B)
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  extra_args: "--max-model-len 32768"
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/eval-standard-nlu.txt b/skills/nemotron-customize/context/eval-standard-nlu.txt
deleted file mode 100644
index 95a43ad69..000000000
--- a/skills/nemotron-customize/context/eval-standard-nlu.txt
+++ /dev/null
@@ -1,1920 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Evaluator
-├── docs
-│   └── evaluation
-│       ├── benchmarks
-│       │   ├── catalog
-│       │   │   └── index.md *
-│       │   └── about.md *
-│       ├── run-evals
-│       │   ├── index.md *
-│       │   ├── logprobs.md *
-│       │   ├── reasoning.md *
-│       │   └── text-gen.md *
-│       ├── index.md *
-│       └── parameters.md *
-└── packages
-    └── nemo-evaluator-launcher
-        ├── examples
-        │   ├── local_basic.yaml *
-        │   └── local_reasoning.yaml *
-        └── src
-            └── nemo_evaluator_launcher
-                └── resources
-                    └── config_templates
-                        └── evaluation
-                            ├── base
-                            │   ├── default.yaml *
-                            │   └── standard.yaml *
-                            └── chat
-                                └── default.yaml *
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/index.md
-```md
-(evaluation-overview)=
-
-# About Evaluation
-
-Evaluate LLMs, VLMs, agentic systems, and retrieval models across 100+ benchmarks using unified workflows.
-
-## Before You Start
-
-Before you run evaluations, ensure you have:
-
-1. **Chosen your approach**: See {ref}`get-started-overview` for installation and setup guidance
-2. **Deployed your model**: See {ref}`deployment-overview` for deployment options
-3. **OpenAI-compatible endpoint**: Your model must expose a compatible API (see {ref}`deployment-testing-compatibility`).
-4. **API credentials**: Access tokens for your model endpoint and Hugging Face Hub.
-
----
-
-## Quick Start: Academic Benchmarks
-
-:::{admonition} Fastest path to evaluate academic benchmarks
-:class: tip
-
-**For researchers and data scientists**: Evaluate your model on standard academic benchmarks in 3 steps.
-
-**Step 1: Choose Your Approach**
-- **Launcher CLI** (Recommended): `nemo-evaluator-launcher run --config packages/nemo-evaluator-launcher/examples/local_basic.yaml`
-- **Python API**: Direct programmatic control with `evaluate()` function
-
-**Step 2: Select Benchmarks**
-
-Common academic suites:
-- **General Knowledge**: `mmlu_pro`, `gpqa_diamond`
-- **Mathematical Reasoning**: `AIME_2025`, `mgsm`
-- **Instruction Following**: `ifbench`, `mtbench`
-
-
-
-Discover all available tasks:
-```bash
-nemo-evaluator-launcher ls tasks
-```
-
-**Step 3: Run Evaluation**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: mmlu_pro
-    - name: ifbench
-```
-
-Launch the job:
-
-```bash
-export NGC_API_KEY=nvapi-...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-<!-- **Next Steps**:
-- {ref}`text-gen` - Complete text generation guide
-- {ref}`eval-parameters` - Optimize configuration parameters
-- {ref}`eval-benchmarks` - Explore all available benchmarks -->
-:::
-
----
-
-## Evaluation Workflows
-
-Select a workflow based on your environment and desired level of control.
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`terminal;1.5em;sd-mr-1` Launcher Workflows
-:link: ../get-started/quickstart/launcher
-:link-type: doc
-Unified CLI for running evaluations across local, Slurm, and cloud backends with built-in result export.
-:::
-
-:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` Core API Workflows
-:link: ../libraries/nemo-evaluator/workflows/python-api
-:link-type: doc
-Programmatic evaluation using Python API for integration into ML pipelines and custom workflows.
-:::
-
-:::{grid-item-card} {octicon}`package;1.5em;sd-mr-1` Container Workflows
-:link: ../libraries/nemo-evaluator/containers/index
-:link-type: doc
-Direct container access for specialized use cases and custom evaluation environments.
-:::
-
-::::
-
-## Configuration and Customization
-
-Configure your evaluations, create custom tasks, explore benchmarks, and extend the framework with these guides.
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`plus;1.5em;sd-mr-1` Configuration Parameters
-:link: parameters
-:link-type: doc
-Comprehensive reference for evaluation configuration parameters and framework-specific settings.
-:::
-
-:::{grid-item-card} {octicon}`list-unordered;1.5em;sd-mr-1` Benchmark Catalog
-:link: eval-benchmarks
-:link-type: ref
-Explore 100+ available benchmarks across 18 evaluation harnesses and their specific use cases.
-:::
-
-:::{grid-item-card} {octicon}`plus;1.5em;sd-mr-1` Extend Framework
-:link: ../libraries/nemo-evaluator/extending/framework-definition-file/index
-:link-type: doc
-Add custom evaluation frameworks using Framework Definition Files for specialized benchmarks.
-:::
-
-::::
-
-## Advanced Features
-
-Scale your evaluations, export results, customize adapters, and resolve issues with these advanced features.
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`workflow;1.5em;sd-mr-1` Multi-Backend Execution
-:link: ../libraries/nemo-evaluator-launcher/configuration/executors/index
-:link-type: doc
-Run evaluations on local machines, HPC clusters, or cloud platforms with unified configuration.
-:::
-
-:::{grid-item-card} {octicon}`database;1.5em;sd-mr-1` Result Export
-:link: ../libraries/nemo-evaluator-launcher/exporters/index
-:link-type: doc
-Export evaluation results to MLflow, Weights & Biases, Google Sheets, and other platforms.
-:::
-
-:::{grid-item-card} {octicon}`shield;1.5em;sd-mr-1` Adapter System
-:link: ../libraries/nemo-evaluator/interceptors/index
-:link-type: doc
-Configure request/response processing, logging, caching, and custom interceptors.
-:::
-
-::::
-
-## Core Evaluation Concepts
-
-- For architectural details and core concepts, refer to {ref}`evaluation-model`.
-- For container specifications, refer to {ref}`nemo-evaluator-containers`.
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/benchmarks/about.md
-```md
-(eval-benchmarks)=
-
-# About Selecting Benchmarks
-
-NeMo Evaluator provides a comprehensive suite of benchmarks spanning academic reasoning, code generation, safety testing, and domain-specific evaluations. Whether you're validating a new model's capabilities or conducting rigorous academic research, you'll find the right benchmarks to assess your AI system's performance.
-See {ref}`benchmarks-full-list` for the complete catalog of available benchmarks.
-
-## Available via Launcher
-
-```{literalinclude} ../_snippets/commands/list_tasks.sh
-:language: bash
-:start-after: "# [snippet-start]"
-:end-before: "# [snippet-end]"
-```
-
-## Available via Direct Container Access
-
-```{literalinclude} ../_snippets/commands/list_tasks_core.sh
-:language: bash
-:start-after: "# [snippet-start]"
-:end-before: "# [snippet-end]"
-```
-
-## Choosing Benchmarks for Academic Research
-
-:::{admonition} Benchmark Selection Guide
-:class: tip
-
-**For General Knowledge**:
-- `mmlu_pro` - Expert-level knowledge across 14 domains
-- `gpqa_diamond` - Graduate-level science questions
-
-**For Mathematical & Quantitative Reasoning**:
-- `AIME_2025` - American Invitational Mathematics Examination (AIME) 2025 questions
-- `mgsm` - Multilingual math reasoning
-
-**For Instruction Following & Alignment**:
-- `ifbench` - Precise instruction following
-- `mtbench` - Multi-turn conversation quality
-
-See benchmark categories below and {ref}`benchmarks-full-list` for more details.
-:::
-
-## Benchmark Categories
-
-###  **Academic and Reasoning**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **simple-evals**
-  - Common evaluation tasks
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/simple-evals)
-  - GPQA-D, MATH-500, AIME 24 & 25, HumanEval, HumanEval+, MGSM, MMLU (also multilingual), MMLU-Pro, MMLU-lite (AR, BN, DE, EN, ES, FR, HI, ID, IT, JA, KO, MY, PT, SW, YO, ZH), SimpleQA, BrowseComp, HealthBench
-* - **lm-evaluation-harness**
-  - Language model benchmarks
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/lm-evaluation-harness)
-  - ARC Challenge (also multilingual), GSM8K, HumanEval, HumanEval+, MBPP, MBPP+, MINERVA Math, RACE, AGIEval, BBH, BBQ, CSQA, Frames, Global MMLU, GPQA-D, HellaSwag (also multilingual), IFEval, MGSM, MMLU, MMLU-Pro, MMLU-ProX (de, es, fr, it, ja), MMLU-Redux, MUSR, OpenbookQA, Piqa, Social IQa, TruthfulQA, WikiLingua, WinoGrande
-* - **hle**
-  - Academic knowledge and problem solving
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/hle)
-  - HLE
-* - **ifbench**
-  - Instruction following
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/ifbench)
-  - IFBench
-* - **mtbench**
-  - Multi-turn conversation evaluation
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench)
-  - MT-Bench
-* - **nemo-skills**
-  - Language model benchmarks (science, math, agentic)
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/nemo_skills)
-  - AIME 24 & 25, BFCL_v3, GPQA, HLE, LiveCodeBench, MMLU, MMLU-Pro
-* - **profbench**
-  - Evaluation of professional knowledge across Physics PhD, Chemistry PhD, Finance MBA and Consulting MBA
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/mtbench)
-  - Report Generation, LLM Judge
-```
-
-:::{note}
-BFCL tasks from the nemo-skills container require function calling capabilities. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible.
-:::
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: ifeval
-    - name: gsm8k_cot_instruct
-    - name: gpqa_diamond
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-export HF_TOKEN=hf_...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-###  **Code Generation**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **bigcode-evaluation-harness**
-  - Code generation evaluation
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bigcode-evaluation-harness)
-  - MBPP, MBPP-Plus, HumanEval, HumanEval+, Multiple (cpp, cs, d, go, java, jl, js, lua, php, pl, py, r, rb, rkt, rs, scala, sh, swift, ts)
-* - **livecodebench**
-  - Coding
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/livecodebench)
-  - LiveCodeBench (v1-v6, 0724_0125, 0824_0225)
-* - **scicode**
-  - Coding for scientific research
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/scicode)
-  - SciCode
-```
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: humaneval_instruct
-    - name: mbpp
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-###  **Safety and Security**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **garak**
-  - Safety and vulnerability testing
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/garak)
-  - Garak
-* - **safety-harness**
-  - Safety and bias evaluation
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/safety-harness)
-  - Aegis v2, WildGuard
-```
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: aegis_v2
-    - name: garak
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-export HF_TOKEN=hf_...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-###  **Function Calling**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **bfcl**
-  - Function calling
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/bfcl)
-  - BFCL v2 and v3
-* - **tooltalk**
-  - Tool usage evaluation
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/tooltalk)
-  - ToolTalk
-```
-
-:::{note}
-Some of the tasks in this category require function calling capabilities. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible.
-:::
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: bfclv2_ast_prompting
-    - name: tooltalk
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-
-###  **Vision-Language Models**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **vlmevalkit**
-  - Vision-language model evaluation
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/vlmevalkit)
-  - AI2D, ChartQA, MMMU, MathVista-MINI, OCRBench, SlideVQA
-```
-
-:::{note}
-The tasks in this category require a VLM chat endpoint. See {ref}`deployment-testing-compatibility` for checking if your endpoint is compatible.
-:::
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: ocrbench
-    - name: chartqa
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-###  **Domain-Specific**
-
-```{list-table}
-:header-rows: 1
-:widths: 20 30 30 50
-
-* - Container
-  - Description
-  - NGC Catalog
-  - Benchmarks
-* - **helm**
-  - Holistic evaluation framework
-  - [Link](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/eval-factory/containers/helm)
-  - MedHelm
-```
-
-**Example Usage:**
-
-Create `config.yml`:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-evaluation:
-  tasks:
-    - name: pubmed_qa
-    - name: medcalc_bench
-```
-
-Run evaluation:
-
-```bash
-export NGC_API_KEY=nvapi-...
-
-nemo-evaluator-launcher run \
-    --config ./config.yml \
-    -o execution.output_dir=results \
-    -o +target.api_endpoint.model_id=meta/llama-3.2-3b-instruct \
-    -o +target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
-    -o +target.api_endpoint.api_key_name=NGC_API_KEY
-```
-
-## Container Details
-
-For detailed specifications of each container, see {ref}`nemo-evaluator-containers`.
-
-### Quick Container Access
-
-Pull and run any evaluation container directly:
-
-```bash
-# Academic benchmarks
-docker pull nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }}
-docker run --rm -it nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }}
-
-# Code generation
-docker pull nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:{{ docker_compose_latest }}
-docker run --rm -it nvcr.io/nvidia/eval-factory/bigcode-evaluation-harness:{{ docker_compose_latest }}
-
-# Safety evaluation
-docker pull nvcr.io/nvidia/eval-factory/safety-harness:{{ docker_compose_latest }}
-docker run --rm -it nvcr.io/nvidia/eval-factory/safety-harness:{{ docker_compose_latest }}
-```
-
-### Available Tasks by Container
-
-For a complete list of available tasks in each container:
-
-```bash
-# List tasks in any container
-docker run --rm nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} nemo-evaluator ls
-
-# Or use the launcher for unified access
-nemo-evaluator-launcher ls tasks
-```
-
-## Integration Patterns
-
-NeMo Evaluator provides multiple integration options to fit your workflow:
-
-```bash
-# Launcher CLI (recommended for most users)
-nemo-evaluator-launcher ls tasks
-nemo-evaluator-launcher run --config ./local_mmlu_evaluation.yaml
-
-# Container direct execution
-docker run --rm nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }} nemo-evaluator ls
-
-# Python API (for programmatic control)
-# See the Python API documentation for details
-```
-
-## Benchmark Selection Best Practices
-
-### For Model Development
-
-**Iterative Testing**:
-- Start with `limit_samples=100` for quick feedback during development
-- Run full evaluations before major releases
-- Track metrics over time to measure improvement
-
-**Configuration**:
-```python
-# Development testing
-params = ConfigParams(
-    limit_samples=100,      # Quick iteration
-    temperature=0.01,       # Deterministic
-    parallelism=4
-)
-
-# Production evaluation
-params = ConfigParams(
-    limit_samples=None,     # Full dataset
-    temperature=0.01,       # Deterministic
-    parallelism=8          # Higher throughput
-)
-```
-
-### For Specialized Domains
-
-- **Code Models**: Focus on `humaneval`, `mbpp`, `livecodebench`
-- **Instruction Models**: Emphasize `ifbench`, `mtbench`
-- **Multilingual Models**: Include `arc_multilingual`, `hellaswag_multilingual`, `mgsm`
-- **Safety-Critical**: Prioritize `safety-harness` and `garak` evaluations
-
-
-## Next Steps
-
-- **Container Details**: Browse {ref}`nemo-evaluator-containers` for complete specifications
-- **Custom Benchmarks**: Learn {ref}`framework-definition-file` for custom evaluations
-
-
-
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/benchmarks/catalog/index.md
-```md
-(benchmarks-full-list)=
-# Available Benchmarks
-
-<!-- NOTE(agronskiy) below file is autogenerated -->
-```{include} all/benchmarks-table.md
-```
-
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/index.md
-```md
-(eval-run)=
-
-# Evaluation Techniques
-
-Follow step-by-step guides for different evaluation scenarios and methodologies in NeMo Evaluator.
-
-## Before You Start
-
-Ensure you have:
-
-1. Completed the initial getting started guides for {ref}`gs-install` and {ref}`gs-quickstart`.
-2. Prepared your endpoint and API key, or the checkpoint you wish to deploy.
-3. Prepared your [Hugging Face token](https://huggingface.co/docs/hub/en/security-tokens) for accessing gated datasets.
-
-
-## Evaluations
-
-Select an evaluation type tailored to your model capabilities.
-
-::::{grid} 1 2 2 2
-:gutter: 1 1 1 2
-
-:::{grid-item-card} {octicon}`pencil;1.5em;sd-mr-1` Text Generation
-:link: text-gen
-:link-type: ref
-Measure model performance through natural language generation for academic benchmarks, reasoning tasks, and general knowledge assessment.
-:::
-
-:::{grid-item-card} {octicon}`graph;1.5em;sd-mr-1` Log-Probability
-:link: logprobs
-:link-type: ref
-Assess model confidence and uncertainty using log-probabilities for multiple-choice scenarios without text generation.
-:::
-
-:::{grid-item-card} {octicon}`comment;1.5em;sd-mr-1` Reasoning
-:link: run-eval-reasoning
-:link-type: ref
-Control the thinking budget and post-process the responses to extract the reasoning content and the final answer.
-:::
-
-
-::::
-
-<!-- TODO: add once ready
-:::{grid-item-card} {octicon}`code;1.5em;sd-mr-1` Code Generation
-:link: code-generation
-:link-type: ref
-Measure programming capabilities through code generation, completion, and algorithmic problem solving.
-:::
-
-:::{grid-item-card} {octicon}`tools;1.5em;sd-mr-1` Function Calling
-:link: function-calling
-:link-type: ref
-Assess tool use capabilities, API calling accuracy, and structured output generation for agent-like behaviors.
-::: -->
-
-
-<!-- TODO: add once ready
-Code Generation <code-generation>
-Function Calling <function-calling> -->
-
-
-:::{toctree}
-:hidden:
-Text Generation <text-gen>
-Log Probability <logprobs>
-Reasoning <reasoning>
-:::
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/text-gen.md
-```md
-(text-gen)=
-
-# Text Generation Evaluation
-
-Text generation evaluation is the primary method for assessing LLM capabilities where models produce natural language responses to prompts. This approach evaluates the quality, accuracy, and appropriateness of generated text across various tasks and domains.
-
-
-:::{tip}
-In the example below we use the `gpqa_diamond` benchmark, but the instructions provided apply to all text generation tasks, such as:
-
-- `mmlu`
-- `mmlu_pro`
-- `ifeval`
-- `gsm8k`
-- `mgsm`
-- `mbpp`
-
-:::
-
-## Before You Start
-
-Ensure you have:
-
-- **Model Endpoint**: An OpenAI-compatible API endpoint for your model (completions or chat). See {ref}`deployment-testing-compatibility` for snippets you can use to test your endpoint.
-- **API Access**: Valid API key if your endpoint requires authentication
-- **Installed Packages**: NeMo Evaluator or access to evaluation containers
-
-## Evaluation Approach
-
-In text generation evaluation:
-
-1. **Prompt Construction**: Models receive carefully crafted prompts (questions, instructions, or text to continue)
-2. **Response Generation**: Models generate natural language responses using their trained parameters
-3. **Response Assessment**: Generated text is evaluated for correctness, quality, or adherence to specific criteria
-4. **Metric Calculation**: Numerical scores are computed based on evaluation criteria
-
-This differs from **log-probability evaluation** where models assign confidence scores to predefined choices.
-For log-probability methods, see the {ref}`logprobs`.
-
-
-## Use NeMo Evaluator Launcher
-
-Use an example config for evaluating the [Meta Llama 3.1 8B Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model:
-
-```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_basic.yaml
-:language: yaml
-:start-after: "[docs-start-snippet]"
-```
-
-
-To launch the evaluation, run:
-
-```bash
-
-export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here  # GPQA is a gated dataset
-export NGC_API_KEY=nvapi-your-token-here  # API Key with access to build.nvidia.com
-
-nemo-evaluator-launcher run \
-  --config packages/nemo-evaluator-launcher/examples/local_basic.yaml
-```
-
-
-## Use NeMo Evaluator
-
-Start `simple-evals` docker container:
-
-```bash
-docker run --rm -it nvcr.io/nvidia/eval-factory/simple-evals:{{ docker_compose_latest }}
-```
-
-or install the `nemo-evaluator` and `nvidia-simple-evals` Python packages in your environment of choice:
-
-```bash
-pip install nemo-evaluator nvidia-simple-evals
-```
-
-### Run with CLI
-
-```bash
-export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here  # GPQA is a gated dataset
-export NGC_API_KEY=nvapi-your-token-here  # API Key with access to build.nvidia.com
-
-# Run evaluation
-nemo-evaluator run_eval \
-    --eval_type gpqa_diamond \
-    --model_id meta/llama-3.2-3b-instruct \
-    --model_url https://integrate.api.nvidia.com/v1/chat/completions \
-    --model_type chat \
-    --api_key_name NGC_API_KEY \
-    --output_dir ./llama_3_1_8b_instruct_results
-```
-
-### Run with Python API
-
-```python
-# set env variables before entering Python:
-# export HF_TOKEN_FOR_GPQA_DIAMOND=hf_your-token-here  # GPQA is a gated dataset
-# export NGC_API_KEY=nvapi-your-token-here  # API Key with access to build.nvidia.com
-
-from nemo_evaluator.core.evaluate import evaluate
-from nemo_evaluator.api.api_dataclasses import (
-    ApiEndpoint, EvaluationConfig, EvaluationTarget, ConfigParams, EndpointType
-)
-
-# Configure target endpoint
-api_endpoint = ApiEndpoint(
-    url="https://integrate.api.nvidia.com/v1/chat/completions",
-    type=EndpointType.CHAT,
-    model_id="meta/llama-3.2-3b-instruct",
-    api_key="NGC_API_KEY"  # variable name storing the key
-)
-target = EvaluationTarget(api_endpoint=api_endpoint)
-
-# Configure evaluation task
-config = EvaluationConfig(
-    type="gpqa_diamond",
-    output_dir="./llama_3_1_8b_instruct_results"
-)
-
-# Execute evaluation
-results = evaluate(target_cfg=target, eval_cfg=config)
-```
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/logprobs.md
-```md
-(logprobs)=
-# Evaluate LLMs Using Log-Probabilities
-
-## Introduction
-
-While the most typical approach to LLM evaluation involves assessing the quality of a model's generated response to a question, an alternative method uses **log-probabilities**.
-
-In this approach, we quantify a model's "surprise" or uncertainty when processing a text sequence.
-This is done by calculating the sum of log-probabilities that the model assigns to each token.
-A higher sum indicates the model is more confident about the sequence.
-
-In this evaluation approach:
-* The LLM is given a single combined text containing both the question and a potential answer.
-* Next, the sum of log-probabilities is calculated only for the tokens that belong to the answer.
-* This allows an assessment of how likely it is that the model would provide that answer for the given question.
-
-For multiple-choice scenarios, the answer with the highest sum is treated as the one selected by the model.
-
-The sum of log-probabilities can be used to calculate different metrics, such as **perplexity**.
-Additionally, log-probabilities can be analyzed to assess whether a response would be generated by the model using greedy sampling—a method commonly employed to evaluate **accuracy**.
-
-Using log-probabilities is especially useful for evaluating base (pre-trained) models, as it eliminates the need for complex instruction-following and does not require the model to adhere to a specific output format.
-
-:::{tip}
-In the example below we use the `piqa` benchmark, but the instructions provided apply to all `lm-evaluation-harness` tasks utilizing log-probabilities, such as:
-
-- arc_challenge
-- arc_multilingual
-- bbh
-- commonsense_qa
-- hellaswag
-- hellaswag_multilingual
-- musr
-- openbookqa
-- social_iqa
-- truthfulqa
-- winogrande
-:::
-
-## Before You Start
-
-Ensure you have:
-
-- **Completions Endpoint**: Log-probability tasks require a completions endpoint (not chat) that supports the `logprobs` and `echo` parameters (see {ref}`compatibility-log-probs`)
-- **Model Tokenizer**: Access to tokenizer files for client-side tokenization (supported types: `huggingface` or `tiktoken`)
-- **API Access**: Valid API key for your model endpoint if it is gated
-- **Authentication**: Hugging Face token for gated datasets and tokenizers
-
-
-## Use NeMo Evaluator Launcher
-
-Use an example config for deploying and evaluating the [Meta Llama 3.1 8B](https://huggingface.co/meta-llama/Llama-3.1-8B) model:
-
-```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml
-:language: yaml
-:start-after: "[docs-start-snippet]"
-```
-
-To launch the evaluation, run:
-
-```bash
-nemo-evaluator-launcher run \
-  --config packages/nemo-evaluator-launcher/examples/local_vllm_logprobs.yaml
-```
-
-:::{tip}
-Set `deployment: none` and provide `target` specification if you want to evaluate an existing endpoint instead:
-
-```yaml
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-execution:
-  output_dir: llama_local
-  env_vars:
-    HF_TOKEN: ${oc.env:HF_TOKEN}  # needed to access meta-llama/Llama-3.1-8B gated model
- 
-target:
-  api_endpoint:
-    model_id: meta-llama/Llama-3.1-8B
-    url: https://your-endpoint.com/v1/completions
-    api_key_name: NGC_API_KEY # API Key with access to provided url
-
-# specify the benchmarks to evaluate
-evaluation:
-  nemo_evaluator_config:  # global config settings that apply to all tasks
-    config:
-      params:
-        extra:  # for log-probability tasks like piqa, you need to specify the tokenizer
-          tokenizer: meta-llama/Llama-3.1-8B  # or use a path to locally stored checkpoint
-          tokenizer_backend: huggingface      # or "tiktoken"
-  tasks:
-    - name: piqa
-
-```
-:::
-
-## Use NeMo Evaluator
-
-Start `lm-evaluation-harness` docker container:
-
-```bash
-docker run --rm -it nvcr.io/nvidia/eval-factory/lm-evaluation-harness:{{ docker_compose_latest }}
-```
-
-or install `nemo-evaluator` and `nvidia-lm-eval` Python package in your environment of choice:
-
-```bash
-pip install nemo-evaluator nvidia-lm-eval
-```
-
-
-To launch the evaluation, run the following Python code:
-
-```{literalinclude} ../_snippets/piqa_hf.py
-:language: python
-:start-after: "# [snippet-start]"
-:end-before: "# [snippet-end]"
-```
-
-Make sure to provide the source for the tokenizer and a backend for loading it.
-
-For models trained with NeMo Framework, the tokenizer is stored inside the checkpoint directory.
-For the NeMo format it is available inside `context/nemo_tokenizer` subdirectory:
-
-```python
-    extra={
-        "tokenizer": "/workspace/llama3_8b_nemo2/context/nemo_tokenizer",
-        "tokenizer_backend": "huggingface",
-    },
-```
-
-For Megatron Bridge checkpoints, the tokenizer is stored under `tokenizer` subdirectory:
-
-```python
-    extra={
-        "tokenizer": "/workspace/mbridge_llama3_8b/iter_0000000/tokenizer",
-        "tokenizer_backend": "huggingface",
-    },
-```
-
-
-## How it works
-
-When the server receives a `logprobs=<int>` parameter in the request, it returns the log-probabilities of tokens.
-When combined with `echo=true`, the model will include the input in its response, along with the corresponding log-probabilities.
-
-Then the received response is processed on the client (benchmark) side to isolate the log-probabilities corresponding specifically to the answer portion of the input.
-For this purpose the input is tokenized, which makes it possible to trace which log-probabilities originated from the question and which from the answer.
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/run-evals/reasoning.md
-```md
-(run-eval-reasoning)=
-# Evaluation of Reasoning Models
-
-Reasoning models require a distinct approach compared to standard language models. Their outputs are typically longer, may contain dedicated reasoning tokens, and are more susceptible to generating loops or repetitive sequences. Evaluating these models effectively requires custom parameter settings and careful handling of generation constraints.
-
-## Before You Start
-
-Ensure you have:
-
-- **Model Endpoint**: An OpenAI-compatible API endpoint for your model (completions or chat). See {ref}`deployment-testing-compatibility` for snippets you can use to test your endpoint.
-- **API Access**: Valid API key if your endpoint requires authentication
-- **Installed Packages**: NeMo Evaluator or access to evaluation containers
-
-
-## Recommended Settings
-
-### Generation Settings
-
-Below are recommended generation settings for some popular reasoning-optimized models. These configurations should be included in the **model card**:
-
-| Model               | Temperature | Top-p  | Top-k  | 
-|---------------------|-------------|--------|--------|
-| **NVIDIA Nemotron** | 0.6         | 0.95   | —      |
-| **DeepSeek R1**     | 0.6         | 0.95   | —      |
-| **Qwen 230B**       | 0.6         | 0.95   | 20     |
-| **Phi-4 Reasoning** | 0.8         | 0.95   | 50     |
-
-
-### Token Configuration
-
-- `max_new_tokens` must be **significantly increased** for reasoning tasks as it includes the length of both reasoning trace and the final answer.
-- Check the model card to see settings recommended by the model creators.
-- It is important to observe if the specified `max_new_tokens` is enough for the model to finish reasoning.
-
-:::{tip}
-You can verify successful reasoning completion in the logs via the {ref}`interceptor-reasoning` Interceptor, for example:
-
-```
-[I 2025-12-02T16:14:28.257] Reasoning tracking information reasoning_words=1905 original_content_words=85 updated_content_words=85 reasoning_finished=True reasoning_started=True reasoning_tokens=unknown updated_content_tokens=unknown logger=ResponseReasoningInterceptor request_id=ccff76b2-2b85-4eed-a9d0-2363b533ae58
-```
-:::
-
-## Reasoning Output Formats
-
-Reasoning models produce outputs that contain both the **reasoning trace** (the model's step-by-step thinking process) and the **final answer**. The reasoning trace typically includes intermediate thoughts, calculations, and logical steps before arriving at the conclusion.
-
-There are two main ways to structure reasoning output:
-
-### 1. Wrapped with reasoning tokens
-
-e.g.
-
-```
-... </think>
-```
-
-```
-<think> ... </think>
-```
-
-or
-
-```
-<reason> ... </reason><final></final>
-```
-
-Most of the benchmarks expect only the final answer to be present in the model's response.
-If your model endpoint replies with the reasoning trace present in the main content, it needs to be removed from the assistant messages.
-You can do it using the {ref}`interceptor-reasoning` Interceptor.
-The interceptor will remove reasoning trace from the content and (optionally) track statistics for reasoning traces.
-
-:::{note}
-The `ResponseReasoningInterceptor` is by default configured for the `...</think>` and `<think> ...</think>` format. If your model uses these special tokens, you do not need to modify anything in your configuration.
-:::
-
-### 2. Returned as `reasoning_content` field in messages output
-
-If your model is deployed with e.g. vLLM, sglang or NIM, the reasoning part of the model's output is likely returned in the separate `reasoning_content` field in messages output (see [vLLM documentation](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html) and [sglang documentation](https://sgl-project.github.io/advanced_features/separate_reasoning.html)).
-
-In the messages returned by the endpoint, there are:
-
-- `reasoning_content`: The reasoning part of the output.
-- `content`: The content of the final answer.
-
-Unlike the first method, this setup does not require any extra response parsing.
-However, in some benchmarks, errors may appear if the reasoning has not finished and the benchmark does not support empty answers in `content`.
-
-#### Enabling reasoning parser in vLLM
-
-To enable the `reasoning_content` field in vLLM, you need to pass the `--reasoning-parser` argument to the vLLM server.
-In NeMo Evaluator Launcher, you can do this via `deployment.extra_args`:
-
-```yaml
-deployment:
-  hf_model_handle: Qwen/Qwen3-Next-80B-A3B-Thinking
-  extra_args: "--reasoning-parser deepseek_r1"
-```
-
-Available reasoning parsers depend on your vLLM version. Common options include `deepseek_r1` for models using `<think>...</think>` format.
-See the [vLLM reasoning outputs documentation](https://docs.vllm.ai/en/stable/features/reasoning_outputs.html) for details.
-
----
-
-## Control the Reasoning Effort
-
-Some models allow turning reasoning on/off or setting its level of effort. There are usually 2 ways of doing it:
-
-- **Special instruction in the system prompt**
-- **Extra parameters passed to the chat_template**
-
-:::{tip}
-Check the model card and documentation of the deployment of your choice to see how you can control the reasoning effort for your model.
-If there are several options available, it is recommended to use the dedicated chat template parameters over the system prompt.
-:::
-
-### Control reasoning with the system prompt
-
-In this example we will use the [NVIDIA-Nemotron-Nano-9B-v2](https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2/modelcard) model.
-This model allows you to control the reasoning effort by including `/think` or `/no_think` in the system prompt, e.g.:
-
-
-```json
-{
-  "model": "nvidia/nvidia-nemotron-nano-9b-v2",
-  "messages": [
-    {"role": "system", "content": "You are a helpful assistant. /think"},
-    {"role": "user", "content": "What is 2+2?"}
-  ],
-  "temperature": 0.6,   
-  "top_p": 0.95,
-  "max_tokens": 32768
-}
-```
-
-When launching the evaluation, we can use the {ref}`interceptor-system-messages` Interceptor to add `/think` or `/no_think` to the system prompt.
-
-
-```yaml
-config:
-  params:
-    temperature: 0.6
-    top_p: 0.95
-    max_new_tokens: 32768  # for reasoning + final answer
-target:
-  api_endpoint:
-    adapter_config:
-      process_reasoning_traces: true # strips reasoning tokens and collects reasoning stats
-      use_system_prompt: true # turn reasoning on with special system prompt
-      custom_system_prompt: >-
-        "/think"
-```
-
-
-### Control reasoning with additional parameters
-
-In this example we will use the [Granite-3.3-8B-Instruct](https://build.nvidia.com/ibm/granite-3_3-8b-instruct/modelcard) model.
-Unlike NVIDIA-Nemotron-Nano-9B-v2, this model allows you to turn the reasoning on with an additional `thinking` parameter passed to the chat template:
-
-```json
-{
-  "model": "ibm/granite-3.3-8b-instruct",
-  "messages": [
-    {
-      "role": "user",
-      "content": "What is 2+2?"
-    }
-  ],
-  "temperature": 0.2,
-  "top_p": 0.7,
-  "max_tokens": 8192,
-  "seed": 42,
-  "stream": true,
-  "chat_template_kwargs": {
-    "thinking": true
-  }
-}
-```
-
-When running the evaluation, use the {ref}`interceptor-payload-modification` Interceptor to add this parameter to benchmarks' requests:
-
-```yaml
-config:
-  params:
-    temperature: 0.6
-    top_p: 0.95
-    max_new_tokens: 32768  # for reasoning + final answer
-target:
-  api_endpoint:
-    adapter_config:
-      process_reasoning_traces: true 
-      params_to_add:
-        chat_template_kwargs:
-          thinking: true
-```
-
-
-## Benchmarks for Reasoning
-
-Reasoning models excel at tasks that require multi-step thinking, logical deduction, and complex problem-solving. The following benchmark categories are particularly well-suited for evaluating reasoning capabilities:
-
-
-- **CoT tasks**: e.g., AIME, Math, GPQA-diamond
-- **Coding**: e.g., scicodebench, livecodebench
-
-
-:::{tip}
-When evaluating your model on a task that does not require step-by-step thinking, consider turning the reasoning off or lowering the thinking budget.
-:::
-
-
-## Full Working Example
-
-### Run the evaluation
-
-An example config is available in `packages/nemo-evaluator-launcher/examples/local_reasoning.yaml`:
-
-```{literalinclude} ../../../packages/nemo-evaluator-launcher/examples/local_reasoning.yaml
-:language: yaml
-:start-after: "[docs-start-snippet]"
-```
-
-To launch the evaluation, run:
-
-```bash
-export NGC_API_KEY=nvapi-...
-nemo-evaluator-launcher run \
-  --config packages/nemo-evaluator-launcher/examples/local_reasoning.yaml
-```
-
-### Analyze the artifacts
-
-NeMo Evaluator produces several artifacts for analysis after evaluation completion.
-The primary output file is `results.yaml`, which stores the metrics produced by the benchmark (see {ref}`evaluation-output` for more details).
-
-The `eval_factory_metrics.json` file provides valuable insights into your model's behavior.
-When the reasoning interceptor is enabled, this file contains a `reasoning` key that stores statistics about reasoning traces in your model's responses:
-
-```json
-"reasoning": {
-    "description": "Reasoning statistics saved during processing",
-    "total_responses": 3672,
-    "responses_with_reasoning": 2860,
-    "reasoning_finished_count": 2860,
-    "reasoning_finished_ratio": 1.0,
-    "reasoning_started_count": 2860,
-    "reasoning_unfinished_count": 0,
-    "avg_reasoning_words": 153.21,
-    "avg_original_content_words": 192.17,
-    "avg_updated_content_words": 38.52,
-    "max_reasoning_words": 806,
-    "max_original_content_words": 863,
-    "max_updated_content_words": 863,
-    "max_reasoning_tokens": null,
-    "avg_reasoning_tokens": null,
-    "max_updated_content_tokens": null,
-    "avg_updated_content_tokens": null,
-    "total_reasoning_words": 561696,
-    "total_original_content_words": 705555,
-    "total_updated_content_words": 140999,
-    "total_reasoning_tokens": 0,
-    "total_updated_content_tokens": 0
-  },
-```
-
-In the example above, the model used reasoning for 2860 out of 3672 responses (approximately 78%).
-
-The matching values for `reasoning_started_count` and `reasoning_finished_count` (and `reasoning_unfinished_count` being 0) indicate that the `max_new_tokens` parameter was set sufficiently high, allowing the model to complete all reasoning traces without truncation.
-
-These statistics also enable cost analysis for reasoning operations.
-While the endpoint in this example does not return reasoning token usage statistics (the `*_tokens` fields are null or zero), you can still analyze computational cost using the word count metrics from the responses.
-
-For more information on available artifacts, see {ref}`evaluation-output`.
-
-```
-
-File: /Users/mromeijn/src/Evaluator/docs/evaluation/parameters.md
-```md
-(eval-parameters)=
-
-# Evaluation Configuration Parameters
-
-Comprehensive reference for configuring evaluation tasks in {{ product_name_short }}, covering universal parameters, framework-specific settings, and optimization patterns.
-
-:::{admonition} Quick Navigation
-:class: info
-
-**Looking for available benchmarks?**
-- {ref}`eval-benchmarks` - Browse available benchmarks by category
-
-**Need help getting started?**
-- {ref}`evaluation-overview` - Overview of evaluation workflows
-- {ref}`eval-run` - Step-by-step evaluation guides
-:::
-
-## Overview
-
-All evaluation tasks in {{ product_name_short }} use the {ref}`ConfigParams <modelling-inout>` class for configuration. This provides a consistent interface across different evaluation harnesses while allowing framework-specific customization through the `extra` parameter. Default configuration (including which parameters a task uses) is defined in the **Framework Definition File (FDF)** for each framework; see {ref}`framework-definition-file` for details.
-
-
-```python
-from nemo_evaluator.api.api_dataclasses import ConfigParams
-
-# Basic configuration
-params = ConfigParams(
-    temperature=0,
-    top_p=1.0,
-    max_new_tokens=256,
-    limit_samples=100
-)
-
-# With framework-specific parameters (extra)
-params = ConfigParams(
-    temperature=0,
-    parallelism=8,
-    extra={
-        "num_fewshot": 5,
-        "tokenizer": "/path/to/tokenizer",
-        "custom_prompt": "Answer the question:"
-    }
-)
-```
-
-:::{admonition} How to see possible parameters for a given task
-:class: important
-
-**Python API (core)** — Get default params and which params a task uses. Use `framework_name.task_name` to avoid ambiguity when the same task name exists in multiple harnesses:
-
-```python
-from nemo_evaluator.core.input import get_available_evaluations
-
-# Returns (framework_evals_mapping, framework_defaults, all_eval_name_mapping)
-framework_evals, _, _ = get_available_evaluations()
-
-# Use framework_name.task_name (e.g. simple_evals.mmlu_pro) for a single task
-framework_name, task_name = "simple_evals", "mmlu_pro"
-eval_obj = framework_evals[framework_name][task_name]
-
-# Default params for this task (ConfigParams / dict-like)
-print(eval_obj.config.params)
-
-# Command template shows which {{ config.params.* }} the task uses
-print(eval_obj.command)
-```
-
-**CLI (core)** — List tasks, then show merged config (including params) for a task:
-
-```bash
-# List available tasks
-nemo-evaluator ls
-
-# Show full rendered config (including config.params) for a task without running
-# Use framework_name.task_name (e.g. simple_evals.mmlu_pro) to avoid ambiguity
-nemo-evaluator run_eval --eval_type simple_evals.mmlu_pro --model_id x --model_url https://example.com/v1/chat/completions --model_type chat --output_dir ./out --dry_run
-```
-
-The `--dry_run` output prints the merged configuration (YAML) and the rendered command, so you can see which parameters apply to that task.
-
-**Launcher** — If you use the launcher, `nemo-evaluator-launcher ls task <task_name>` (or `harness.task_name`) prints task details including **Defaults** with `config.params` and `config.params.extra`. List all tasks with `nemo-evaluator-launcher ls tasks`.
-:::
-
-## Universal Parameters
-
-These parameters are standardized across all frameworks and share the same names and semantics. That does **not** mean every framework supports every parameter: each task’s command template only uses a subset. If you pass a parameter that the task does not use, you will see a warning like: *"Configuration contains parameter(s) that are not used in the command template"* (see `validate_params_in_command` in `nemo_evaluator.core.utils`). 
-
-```{list-table}
-:header-rows: 1
-:widths: 12 14 10 28 22 14
-
-* - Category
-  - Parameter
-  - Type
-  - Description
-  - Example Values
-  - Notes
-* - Sampling
-  - `temperature`
-  - `float`
-  - Sampling randomness
-  - `0` (deterministic), `0.7` (creative)
-  - Use `0` for reproducible results
-* - Sampling
-  - `top_p`
-  - `float`
-  - Nucleus sampling threshold
-  - `1.0` (disabled), `0.9` (selective)
-  - Controls diversity of generated text
-* - Sampling
-  - `max_new_tokens`
-  - `int`
-  - Maximum response length
-  - `256`, `512`, `1024`
-  - Limits generation length
-* - Evaluation control
-  - `limit_samples`
-  - `int/float`
-  - Evaluation subset size
-  - `100` (count), `0.1` (10% of dataset)
-  - Use for quick testing or resource limits
-* - Evaluation control
-  - `task`
-  - `str`
-  - Task-specific identifier
-  - `"custom_task"`
-  - Used by some harnesses for task routing
-* - Performance
-  - `parallelism`
-  - `int`
-  - Concurrent request threads
-  - `1`, `8`, `16`
-  - Balance against server capacity
-* - Performance
-  - `max_retries`
-  - `int`
-  - Retry attempts for failed requests
-  - `3`, `5`, `10`
-  - Increases robustness for network issues
-* - Performance
-  - `request_timeout`
-  - `int`
-  - Request timeout (seconds)
-  - `60`, `120`, `300`
-  - Adjust for model response time
-```
-
-## Framework-Specific Parameters
-
-Framework-specific parameters are passed through the `extra` dictionary within `ConfigParams`.
-
-::::{dropdown} LM-Evaluation-Harness Parameters
-:icon: code-square
-
-```{list-table}
-:header-rows: 1
-:widths: 15 10 30 25 20
-
-* - Parameter
-  - Type
-  - Description
-  - Example Values
-  - Use Cases
-* - `num_fewshot`
-  - `int`
-  - Few-shot examples count
-  - `0`, `5`, `25`
-  - Academic benchmarks
-* - `tokenizer`
-  - `str`
-  - Tokenizer path
-  - `"/path/to/tokenizer"`
-  - Log-probability tasks
-* - `tokenizer_backend`
-  - `str`
-  - Tokenizer implementation
-  - `"huggingface"`, `"tiktoken"`
-  - Custom tokenizer setups
-* - `trust_remote_code`
-  - `bool`
-  - Allow remote code execution
-  - `True`, `False`
-  - For custom tokenizers
-* - `add_bos_token`
-  - `bool`
-  - Add beginning-of-sequence token
-  - `True`, `False`
-  - Model-specific formatting
-* - `add_eos_token`
-  - `bool`
-  - Add end-of-sequence token
-  - `True`, `False`
-  - Model-specific formatting
-* - `fewshot_delimiter`
-  - `str`
-  - Separator between examples
-  - `"\\n\\n"`, `"\\n---\\n"`
-  - Custom prompt formatting
-* - `fewshot_seed`
-  - `int`
-  - Reproducible example selection
-  - `42`, `1337`
-  - Ensures consistent few-shot examples
-* - `description`
-  - `str`
-  - Custom prompt prefix
-  - `"Answer the question:"`
-  - Task-specific instructions
-* - `bootstrap_iters`
-  - `int`
-  - Statistical bootstrap iterations
-  - `1000`, `10000`
-  - For confidence intervals
-```
-
-::::
-
-::::{dropdown} Simple-Evals Parameters
-:icon: code-square
-
-```{list-table}
-:header-rows: 1
-:widths: 15 10 30 25 20
-
-* - Parameter
-  - Type
-  - Description
-  - Example Values
-  - Use Cases
-* - `pass_at_k`
-  - `list[int]`
-  - Code evaluation metrics
-  - `[1, 5, 10]`
-  - Code generation tasks
-* - `timeout`
-  - `int`
-  - Code execution timeout
-  - `5`, `10`, `30`
-  - Code generation tasks
-* - `max_workers`
-  - `int`
-  - Parallel execution workers
-  - `4`, `8`, `16`
-  - Code execution parallelism
-* - `languages`
-  - `list[str]`
-  - Target programming languages
-  - `["python", "java", "cpp"]`
-  - Multi-language evaluation
-```
-
-::::
-
-::::{dropdown} BigCode-Evaluation-Harness Parameters
-:icon: code-square
-
-```{list-table}
-:header-rows: 1
-:widths: 15 10 30 25 20
-
-* - Parameter
-  - Type
-  - Description
-  - Example Values
-  - Use Cases
-* - `num_workers`
-  - `int`
-  - Parallel execution workers
-  - `4`, `8`, `16`
-  - Code execution parallelism
-* - `eval_metric`
-  - `str`
-  - Evaluation metric
-  - `"pass_at_k"`, `"bleu"`
-  - Different scoring methods
-* - `languages`
-  - `list[str]`
-  - Programming languages
-  - `["python", "javascript"]`
-  - Language-specific evaluation
-```
-
-::::
-
-::::{dropdown} Safety and Specialized Harnesses
-:icon: code-square
-
-```{list-table}
-:header-rows: 1
-:widths: 15 10 30 25 20
-
-* - Parameter
-  - Type
-  - Description
-  - Example Values
-  - Use Cases
-* - `probes`
-  - `str`
-  - Garak security probes
-  - `"ansiescape.AnsiEscaped"`
-  - Security evaluation
-* - `detectors`
-  - `str`
-  - Garak security detectors
-  - `"base.TriggerListDetector"`
-  - Security evaluation
-* - `generations`
-  - `int`
-  - Number of generations per prompt
-  - `1`, `5`, `10`
-  - Safety evaluation
-```
-
-::::
-
-## Parameter Selection Guidelines
-
-- Configure `parallelism` and `request_timeout` based on server capacity.
-- Use `limit_samples` for subset evaluation (e.g. for debugging or quick validation).
-
-## Common Configuration Errors
-
-### Tokenizer Issues
-
-:::{admonition} Problem
-:class: error
-Missing tokenizer for log-probability tasks
-
-```python
-# Incorrect - missing tokenizer
-params = ConfigParams(extra={})
-```
-:::
-
-:::{admonition} Solution
-:class: tip
-Always specify tokenizer for log-probability tasks
-
-```python
-# Correct
-params = ConfigParams(
-    extra={
-        "tokenizer_backend": "huggingface",
-        "tokenizer": "/path/to/nemo_tokenizer"
-    }
-)
-```
-:::
-
-### Performance Issues
-
-:::{admonition} Problem
-:class: error
-Excessive parallelism overwhelming server
-
-```python
-# Incorrect - too many concurrent requests
-params = ConfigParams(parallelism=100)
-```
-:::
-
-:::{admonition} Solution
-:class: tip
-Start conservative and scale up
-
-```python
-# Correct - reasonable concurrency
-params = ConfigParams(parallelism=8, max_retries=3)
-```
-:::
-
-### Parameter Conflicts
-
-:::{admonition} Problem
-:class: error
-Mixing generation and log-probability parameters
-
-```python
-# Incorrect - generation params unused for log-probability
-params = ConfigParams(
-    temperature=0.7,  # Ignored for log-probability tasks
-    extra={"tokenizer": "/path"}
-)
-```
-:::
-
-:::{admonition} Solution
-:class: tip
-Use appropriate parameters for task type
-
-```python
-# Correct - only relevant parameters
-params = ConfigParams(
-    limit_samples=100,  # Relevant for all tasks
-    extra={"tokenizer": "/path"}  # Required for log-probability
-)
-```
-:::
-
-## Next Steps
-
-- **Basic Usage**: See {ref}`text-gen` for getting started
-- **Custom Tasks**: Learn {ref}`eval-custom-tasks` for specialized evaluations
-- **Troubleshooting**: Refer to {ref}`troubleshooting-index` for common issues
-- **Benchmarks**: Browse {ref}`eval-benchmarks` for task-specific recommendations
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_basic.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# How to use:
-#
-# 1. copy this file locally or clone the repository
-# 2. (optional) comment out limit_samples in the config file to run on the full dataset
-# 3. run `nemo-evaluator-launcher run --config path/to/local_basic.yaml`
-#
-# ⚠️  WARNING:
-#     Always run full evaluations (without limit_samples) for actual benchmark results.
-#     Using a subset of samples is solely for testing configuration and setup.
-#     Results from such test runs should NEVER be used to compare models or
-#     report benchmark performance.
-
-# [docs-start-snippet]
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-execution:
-  output_dir: nel-results
-
-target:
-  api_endpoint:
-    # see https://build.nvidia.com/meta/llama-3_1-8b-instruct for endpoint details
-    model_id: meta/llama-3.2-3b-instruct
-    url: https://integrate.api.nvidia.com/v1/chat/completions
-    api_key_name: NGC_API_KEY # API Key with access to build.nvidia.com
-
-# specify the benchmarks to evaluate
-evaluation:
-  # global config settings that apply to all tasks, unless overridden by task-specific config
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600  # timeout for API request in seconds
-        parallelism: 1  # 1 parallel request to avoid overloading the server
-        limit_samples: 10 # TEST ONLY: Limits all benchmarks to 10 samples for quick testing
-  tasks:
-    - name: lm-evaluation-harness.ifeval
-    - name: simple_evals.gpqa_diamond
-      env_vars:
-        HF_TOKEN: host:HF_TOKEN_FOR_GPQA_DIAMOND # Click request access for GPQA-Diamond: https://huggingface.co/datasets/Idavidrein/gpqa
-    - name: bigcode-evaluation-harness.mbpp-chat
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/examples/local_reasoning.yaml
-```yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# How to use:
-#
-# 1. copy this file locally or clone the repository
-# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
-# 3. run `nemo-evaluator-launcher run --config path/to/local_reasoning.yaml`
-
-# ⚠️  WARNING:
-#     Always run full evaluations (without limit_samples) for actual benchmark results.
-#     Using a subset of samples is solely for testing configuration and setup.
-#     Results from such test runs should NEVER be used to compare models or
-#     report benchmark performance.
-
-# [docs-start-snippet]
-defaults:
-  - execution: local
-  - deployment: none
-  - _self_
-
-execution:
-  output_dir: nel-results
-
-target:
-  api_endpoint:
-    # see https://build.nvidia.com/nvidia/nvidia-nemotron-nano-9b-v2 for endpoint details
-    model_id: nvidia/nvidia-nemotron-nano-9b-v2
-    url: https://integrate.api.nvidia.com/v1/chat/completions
-    api_key_name: NGC_API_KEY # API Key with access to build.nvidia.com
-
-evaluation:
-  # global config settings that apply to all tasks, unless overridden by task-specific config
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600  # timeout for API request in seconds
-        parallelism: 1  # 1 parallel request to avoid overloading the server
-        # limit_samples: 10 # uncomment to limit number of samples for quick testing
-
-  tasks:
-    # run complex tasks with reasoning on
-    - name: simple_evals.mmlu_pro
-      nemo_evaluator_config:
-        config:
-          params:
-            temperature: 0.6
-            top_p: 0.95
-            max_new_tokens: 32768  # for reasoning + final answer
-        target:
-          api_endpoint:
-            adapter_config:
-              process_reasoning_traces: true # strips reasoning tokens and collects reasoning stats
-              use_system_prompt: true # turn reasoning on with special system prompt
-              custom_system_prompt: >-
-                "/think"
-
-    # run simpler tasks with reasoning off
-    - name: lm-evaluation-harness.ifeval
-      nemo_evaluator_config:
-        config:
-          params:
-            max_new_tokens: 1024 # we can use fewer tokens with reasoning off
-        target:
-          api_endpoint:
-            adapter_config:
-              use_system_prompt: true # turn reasoning off with special system prompt
-              custom_system_prompt: >-
-                "/no_think"
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/base/default.yaml
-```yaml
-# Base model evaluation configuration (completions endpoint)
-# Uses log-probabilities for multiple-choice tasks
-# See: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/run-evals/logprobs.html
-evaluation:
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        parallelism: 64
-        extra:
-          # Tokenizer required for log-probability tasks
-          tokenizer: ??? # HuggingFace model handle or path to tokenizer (must match evaluated model)
-          tokenizer_backend: huggingface # or "tiktoken"
-  env_vars:
-    HF_TOKEN: HF_TOKEN # Required to access gated tokenizers
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/base/standard.yaml
-```yaml
-# Standard LLM Benchmarks for base models (completions endpoint, log-probability based)
-# These tasks use log-probabilities to assess model confidence on answer choices
-evaluation:
-  tasks:
-    - name: lm-evaluation-harness.mmlu # Log-prob based multiple choice
-    - name: lm-evaluation-harness.gpqa # Log-prob based (completions version)
-    - name: lm-evaluation-harness.arc_challenge # Log-prob based
-    - name: lm-evaluation-harness.hellaswag # Log-prob based
-    - name: lm-evaluation-harness.commonsense_qa # Log-prob based
-
-```
-
-File: /Users/mromeijn/src/Evaluator/packages/nemo-evaluator-launcher/src/nemo_evaluator_launcher/resources/config_templates/evaluation/chat/default.yaml
-```yaml
-# Chat model evaluation configuration (chat endpoint)
-evaluation:
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        parallelism: 64
-        temperature: 0.0 # Deterministic for reproducibility
-        max_new_tokens: 2048
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/mbridge-parallelism-performance.txt b/skills/nemotron-customize/context/mbridge-parallelism-performance.txt
deleted file mode 100644
index cc00ec1a0..000000000
--- a/skills/nemotron-customize/context/mbridge-parallelism-performance.txt
+++ /dev/null
@@ -1,6791 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Megatron-Bridge
-├── docs
-│   ├── training
-│   │   ├── images
-│   │   ├── activation-recomputation.md *
-│   │   ├── communication-overlap.md *
-│   │   ├── hybrid-context-parallel.md *
-│   │   ├── megatron-fsdp.md *
-│   │   ├── mixed-precision.md *
-│   │   └── packed-sequences.md *
-│   ├── images
-│   ├── modelopt
-│   ├── models
-│   │   ├── llm
-│   │   └── vlm
-│   ├── releases
-│   ├── parallelisms.md *
-│   └── performance-guide.md *
-├── skills
-│   ├── perf-techniques
-│   │   ├── cuda-graphs
-│   │   │   └── SKILL.md *
-│   │   ├── parallelism-strategies
-│   │   │   └── SKILL.md *
-│   │   ├── sequence-packing
-│   │   │   └── SKILL.md *
-│   │   ├── tp-dp-comm-overlap
-│   │   │   └── SKILL.md *
-│   │   ├── expert-parallel-overlap
-│   │   ├── hybrid-context-parallel
-│   │   ├── megatron-fsdp
-│   │   ├── moe-comm-overlap
-│   │   └── packed-sequences-long-context
-│   ├── adding-model-support
-│   ├── code-style
-│   ├── developer-guide
-│   ├── mlm-bridge-training
-│   ├── multi-node-slurm
-│   ├── parity-testing
-│   └── resiliency
-├── src
-│   └── megatron
-│       └── bridge
-│           ├── training
-│           │   ├── comm_overlap.py * +
-│           │   ├── config.py * +
-│           │   ├── initialize.py * +
-│           │   ├── ...
-│           ├── data
-│           │   └── ...
-│           ├── diffusion
-│           │   └── ...
-│           ├── inference
-│           │   └── ...
-│           ├── models
-│           │   └── ...
-│           ├── peft
-│           ├── recipes
-│           │   └── ...
-│           └── utils
-├── .github
-│   ├── ISSUE_TEMPLATE
-│   ├── actions
-│   │   └── test-template
-│   └── workflows
-│       └── config
-├── .specstory
-├── 3rdparty
-│   └── Megatron-LM
-│       ├── .github
-│       │   ├── ISSUE_TEMPLATE
-│       │   ├── actions
-│       │   │   └── ...
-│       │   ├── scripts
-│       │   └── workflows
-│       │       └── ...
-│       ├── .gitlab
-│       │   ├── scripts
-│       │   └── stages
-│       ├── docker
-│       │   ├── common
-│       │   └── patches
-│       ├── docs
-│       │   ├── advanced
-│       │   ├── api-guide
-│       │   │   └── ...
-│       │   ├── developer
-│       │   ├── discussions
-│       │   │   └── ...
-│       │   ├── get-started
-│       │   ├── images
-│       │   │   └── ...
-│       │   ├── models
-│       │   └── user-guide
-│       │       └── ...
-│       ├── examples
-│       │   ├── academic_paper_scripts
-│       │   │   └── ...
-│       │   ├── bert
-│       │   ├── export
-│       │   │   └── ...
-│       │   ├── gpt3
-│       │   ├── inference
-│       │   │   └── ...
-│       │   ├── llama
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   │   └── ...
-│       │   ├── mixtral
-│       │   ├── multimodal
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   │   └── ...
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── t5
-│       ├── images
-│       ├── megatron
-│       │   ├── core
-│       │   │   └── ...
-│       │   ├── inference
-│       │   ├── legacy
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── training
-│       │       └── ...
-│       ├── scripts
-│       ├── tasks
-│       ├── tests
-│       │   ├── functional_tests
-│       │   │   └── ...
-│       │   ├── test_utils
-│       │   │   └── ...
-│       │   └── unit_tests
-│       │       └── ...
-│       └── tools
-│           ├── bert_embedding
-│           └── checkpoint
-├── docker
-│   ├── common
-│   └── patches
-├── examples
-│   ├── conversion
-│   │   ├── adapter
-│   │   └── compare_hf_and_megatron
-│   ├── decentralized_pg
-│   ├── diffusion
-│   │   └── recipes
-│   │       ├── flux
-│   │       │   └── ...
-│   │       └── wan
-│   │           └── ...
-│   ├── distillation
-│   │   └── llama
-│   │       └── conf
-│   ├── evaluation
-│   │   └── utils
-│   ├── inference
-│   │   └── vlm
-│   ├── long_context
-│   ├── models
-│   │   ├── audio_lm
-│   │   │   ├── qwen2_audio
-│   │   │   └── qwen3_asr
-│   │   ├── bailing
-│   │   ├── gpt_oss
-│   │   ├── minimax_m2
-│   │   ├── nemotron_3
-│   │   │   ├── nano
-│   │   │   └── super
-│   │   ├── qwen3_next
-│   │   │   └── conf
-│   │   ├── sarvam
-│   │   └── vlm
-│   │       ├── gemma3_vl
-│   │       ├── glm_45v
-│   │       ├── kimi_k25_vl
-│   │       ├── ministral3
-│   │       ├── nemotron_vl
-│   │       │   └── ...
-│   │       ├── qwen25_omni
-│   │       ├── qwen35_vl
-│   │       ├── qwen3_vl
-│   │       └── qwen_vl
-│   │           └── ...
-│   ├── peft
-│   ├── quantization
-│   │   └── conf
-│   ├── resiliency
-│   │   ├── fault_tolerance
-│   │   └── straggler_detection
-│   └── rl
-├── scripts
-│   ├── performance
-│   │   ├── configs
-│   │   │   ├── deepseek
-│   │   │   ├── gpt_oss
-│   │   │   ├── kimi
-│   │   │   ├── llama
-│   │   │   ├── nemotronh
-│   │   │   ├── qwen
-│   │   │   └── qwen_vl
-│   │   └── utils
-│   └── training
-├── tests
-│   ├── functional_tests
-│   │   ├── data
-│   │   │   ├── energon
-│   │   │   └── hf_processors
-│   │   ├── diffusion
-│   │   │   ├── flux
-│   │   │   └── wan
-│   │   ├── inference
-│   │   ├── launch_scripts
-│   │   │   ├── active
-│   │   │   └── flaky
-│   │   ├── models
-│   │   │   ├── qwen3_asr
-│   │   │   └── qwen_audio
-│   │   └── test_groups
-│   │       ├── ckpts
-│   │       │   └── ...
-│   │       ├── converter
-│   │       ├── data
-│   │       │   └── ...
-│   │       ├── diffusion
-│   │       │   └── ...
-│   │       ├── models
-│   │       │   └── ...
-│   │       ├── quantization
-│   │       │   └── ...
-│   │       ├── recipes
-│   │       ├── training
-│   │       └── utils
-│   └── unit_tests
-│       ├── data
-│       │   ├── builders
-│       │   ├── datasets
-│       │   ├── energon
-│       │   ├── mimo
-│       │   └── vlm_datasets
-│       ├── diffusion
-│       │   ├── data
-│       │   │   └── ...
-│       │   ├── model
-│       │   │   └── ...
-│       │   └── recipes
-│       │       └── ...
-│       ├── inference
-│       │   └── vlm
-│       ├── models
-│       │   ├── common
-│       │   ├── decorators
-│       │   ├── deepseek
-│       │   ├── gemma
-│       │   ├── gemma_vl
-│       │   ├── glm
-│       │   ├── glm_vl
-│       │   ├── gpt
-│       │   ├── gpt_oss
-│       │   ├── hf_pretrained
-│       │   ├── kimi
-│       │   ├── kimi_vl
-│       │   ├── llama
-│       │   ├── llama_nemotron
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   ├── minimax_m2
-│       │   ├── ministral3
-│       │   ├── mistral
-│       │   ├── nemotron
-│       │   ├── nemotron_vl
-│       │   ├── nemotronh
-│       │   ├── olmoe
-│       │   ├── qwen
-│       │   ├── qwen3_asr
-│       │   │   └── ...
-│       │   ├── qwen_audio
-│       │   ├── qwen_omni
-│       │   │   └── ...
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── sarvam
-│       ├── peft
-│       ├── recipes
-│       │   ├── gemma
-│       │   ├── gpt
-│       │   ├── kimi
-│       │   ├── nemotronh
-│       │   ├── qwen
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── utils
-│       ├── scripts
-│       │   └── performance
-│       ├── training
-│       │   ├── mimo
-│       │   ├── mlm_compat
-│       │   ├── post_training
-│       │   └── utils
-│       └── utils
-└── tutorials
-    ├── data
-    │   └── dclm
-    ├── recipes
-    │   └── llama
-    │       └── conf
-    └── training
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; depth cap 3; selected files shown.
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/process_groups_config.py
-Imports:
-  - from dataclasses import dataclass, field, fields
-  - from functools import partial
-  - from typing import List, Optional
-  - import torch
-  - from megatron.core import parallel_state
-  - from megatron.core.utils import get_model_config
-  - import logging
-  - from megatron.core.utils import log_single_rank
----
-Classes:
-  - ProcessGroupHelperMeta
-    Methods:
-      - L17: def __setattr__(cls, name, value):
-  - ProcessGroupCollection
-    Methods:
-      - L136: def __init__(self, **kwargs):
-      - L143: def __repr__(self):
-      - L161: def use_mpu_process_groups(cls, required_pgs: Optional[List[str]] = None):
-      - L253: def setup_process_groups_for_optimizer(
-        pg_collection: Optional['ProcessGroupCollection'],
-        model_chunks: List,
-        use_gloo_process_groups: bool = True,
-    ):
-      - L444: def setup_process_groups_for_ddp(
-        pg_collection: Optional['ProcessGroupCollection'], config, ddp_config
-    ):
-    Properties:
-      - tp
-      - pp
-      - mp
-      - embd
-      - pos_embd
-      - cp
-      - tp_cp
-      - hcp
-      - ep
-      - expt_tp
-      - tp_ep
-      - tp_ep_pp
-      - tp_dp_cp
-      - dp
-      - dp_cp
-      - expt_dp
-      - intra_dp_cp
-      - intra_expt_dp
-      - inter_dist_opt
-      - intra_dist_opt
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/training/resilience_config.py
-Imports:
-  - from dataclasses import dataclass
-  - from typing import Literal
----
-Classes:
-  - RerunStateMachineConfig
-    Properties:
-      - error_injection_rate
-      - error_injection_type
-      - rerun_mode
-      - check_for_nan_in_loss
-      - check_for_spiky_loss
-  - StragglerDetectionConfig
-    Properties:
-      - log_straggler
-      - straggler_ctrlr_port
-      - straggler_minmax_count
-      - disable_straggler_on_startup
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/training/common_config.py
-Imports:
-  - from dataclasses import dataclass, field
-  - from typing import Literal
-  - import os
----
-Classes:
-  - RNGConfig
-    Properties:
-      - seed
-      - te_rng_tracker
-      - inference_rng_tracker
-      - data_parallel_random_init
-  - ProfilingConfig
-    Properties:
-      - use_nsys_profiler
-      - profile_step_start
-      - profile_step_end
-      - use_pytorch_profiler
-      - pytorch_profiler_collect_shapes
-      - pytorch_profiler_collect_callstack
-      - pytorch_profiler_collect_chakra
-      - profile_ranks
-      - record_memory_history
-      - memory_snapshot_path
-      - record_shapes
-      - nvtx_ranges
-  - DistributedInitConfig
-    Properties:
-      - distributed_backend
-      - distributed_timeout_minutes
-      - align_grad_reduce
-      - local_rank
-      - lazy_mpu_init
-      - use_megatron_fsdp
-      - use_torch_fsdp2
-      - nccl_communicator_config_path
-      - use_tp_pp_dp_mapping
-      - enable_gloo_process_groups
-      - use_sharp
-      - sharp_enabled_group
-      - high_priority_stream_groups
-      - distributed_timeout_seconds_after_init
-      - disable_jit_fuser
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/distributed/distributed_data_parallel_config.py
-Imports:
-  - from dataclasses import dataclass
-  - from typing import Optional
-  - import os
----
-Classes:
-  - DistributedDataParallelConfig
-    Methods:
-      - L168: def __post_init__(self):
-    Properties:
-      - grad_reduce_in_fp32
-      - overlap_grad_reduce
-      - overlap_param_gather
-      - align_param_gather
-      - use_distributed_optimizer
-      - num_distributed_optimizer_instances
-      - check_for_nan_in_grad
-      - check_for_large_grads
-      - bucket_size
-      - pad_buckets_for_high_nccl_busbw
-      - reduce_scatter_with_fp32_accumulation
-      - average_in_collective
-      - fp8_param_gather
-      - reuse_grad_buf_for_mxfp8_param_ag
-      - use_megatron_fsdp
-      - use_custom_fsdp
-      - data_parallel_sharding_strategy
-      - gradient_reduce_div_fusion
-      - suggested_communication_unit_size
-      - preserve_fp32_weights
-      - keep_fp8_transpose_cache
-      - nccl_ub
-      - fsdp_double_buffer
-      - fsdp_db_use_persist_buf_on_alloc_fail
-      - fsdp_all_gather_in_start_param_sync
-      - outer_dp_sharding_strategy
-      - disable_symmetric_registration
-      - fsdp_manual_registration
-      - delay_wgrad_compute
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/models/transformer_config.py
-Imports:
-  - import copy
-  - from dataclasses import dataclass, fields, is_dataclass
-  - from megatron.core.transformer.heterogeneous.heterogeneous_config import (
-    HeterogeneousTransformerConfig as MCoreHeterogeneousTransformerConfig,
-)
-  - from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig
-  - from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig
-  - from megatron.bridge.utils.activation_map import str_to_callable
-  - from megatron.bridge.utils.activation_map import str_to_dtype
----
-Classes:
-  - TransformerConfig
-    Methods:
-      - L97: def __post_init__(self) -> None:
-      - L106: def finalize(self) -> None:
-      - L127: def __deepcopy__(self, memo):
-      - L146: def asdict(self) -> dict:
-    Properties:
-      - _NO_COPY_KEYS
-  - MLATransformerConfig
-    Methods:
-      - L172: def __post_init__(self) -> None:
-      - L181: def finalize(self) -> None:
-  - HeterogeneousTransformerConfig
-    Methods:
-      - L227: def __post_init__(self) -> None:
-      - L236: def finalize(self) -> None:
-      - L248: def get_config_for_layer(self, layer_number: int) -> MCoreTransformerConfig:
-
-Functions:
-  - L31: def _safe_asdict(obj, skip_keys: set[str]) -> dict:
-  - L51: def _resolve_string_fields(config: MCoreTransformerConfig) -> None:
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/optimizer/optimizer_config.py
-Imports:
-  - import fnmatch
-  - from dataclasses import dataclass, field
-  - from typing import Callable, Optional, Tuple, Union
-  - import torch
-  - from ..utils import is_te_min_version
-  - import warnings
-  - import inspect
-  - from transformer_engine.pytorch.optimizers import FusedAdam as Adam
----
-Classes:
-  - ParamPredicate
-    Methods:
-      - L32: def __call__(self, param: torch.nn.Parameter) -> bool:
-    Properties:
-      - name
-      - fn
-  - ParamWithNamePredicate
-    Methods:
-      - L60: def __call__(self, param: torch.nn.Parameter, name: str) -> bool:
-    Properties:
-      - name
-      - fn
-  - ParamKey
-    Methods:
-      - L89: def matches(self, param: torch.nn.Parameter, param_name: str) -> bool:
-    Properties:
-      - name
-      - attr
-      - predicate
-      - with_name_predicate
-  - OptimizerConfig
-    Methods:
-      - L346: def __post_init__(self):
-    Properties:
-      - lr
-      - min_lr
-      - decoupled_lr
-      - decoupled_min_lr
-      - weight_decay
-      - apply_wd_to_qk_layernorm
-      - fp8_recipe
-      - fp16
-      - bf16
-      - reuse_grad_buf_for_mxfp8_param_ag
-      - params_dtype
-      - use_precision_aware_optimizer
-      - store_param_remainders
-      - main_grads_dtype
-      - main_params_dtype
-      - exp_avg_dtype
-      - exp_avg_sq_dtype
-      - optimizer
-      - loss_scale
-      - initial_loss_scale
-      - min_loss_scale
-      - loss_scale_window
-      - hysteresis
-      - adam_beta1
-      - adam_beta2
-      - adam_eps
-      - decoupled_weight_decay
-      - sgd_momentum
-      - muon_momentum
-      - muon_split_qkv
-      - muon_use_nesterov
-      - muon_scale_mode
-      - muon_fp32_matmul_prec
-      - muon_num_ns_steps
-      - muon_tp_mode
-      - muon_extra_scale_factor
-      - use_distributed_optimizer
-      - overlap_param_gather
-      - overlap_param_gather_with_optimizer_step
-      - optimizer_cpu_offload
-      - optimizer_offload_fraction
-      - use_torch_optimizer_for_cpu_offload
-      - overlap_cpu_optimizer_d2h_h2d
-      - pin_cpu_grads
-      - pin_cpu_params
-      - clip_grad
-      - log_num_zeros_in_grad
-      - barrier_with_L1_time
-      - timers
-      - config_logger_dir
-  - AdamOptimizerConfig
-    Properties:
-      - optimizer
-      - adam_beta1
-      - adam_beta2
-      - adam_eps
-  - SGDOptimizerConfig
-    Properties:
-      - optimizer
-      - sgd_momentum
----
-
-
-File: /Users/mromeijn/src/Nemotron/src/nemotron/kit/megatron_stub.py
-Imports:
-  - from dataclasses import dataclass, field
-  - from pathlib import Path
----
-Classes:
-  - DataConfig
-    Properties:
-      - data_path
-      - mock
-      - seq_length
-      - micro_batch_size
-      - global_batch_size
-  - ModelConfig
-    Properties:
-      - name
-      - num_layers
-      - hidden_size
-      - num_attention_heads
-      - ffn_hidden_size
-      - vocab_size
-  - OptimizerConfig
-    Properties:
-      - lr
-      - min_lr
-      - weight_decay
-      - adam_beta1
-      - adam_beta2
-  - TrainingConfig
-    Properties:
-      - max_steps
-      - log_interval
-      - eval_interval
-      - save_interval
-      - fp16
-      - bf16
-  - CheckpointConfig
-    Properties:
-      - dir
-      - save_on_train_end
-      - resume_from
-  - ConfigContainer
-    Properties:
-      - data
-      - model
-      - optimizer
-      - training
-      - checkpoint
----
-
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md
-```md
-# Parallelisms Guide
-
-Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency.
-
-## Data Parallelism
-
-Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps.
-
-### Distributed Data Parallelism
-
-Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives.
-
-![Distributed Data Parallelism](images/ddp.gif)
-*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.*
-
-### Distributed Optimizer
-
-[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs its own gradient shard, the distributed optimizer conducts reduce-scatter of the parameter gradients instead of all-reduce of them. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory need of large-scale LLM training.
-
-### Enable Data Parallelism
-
-In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group.
-
-To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig`
-
-```python
-from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig
-
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    lr=3e-4,
-    weight_decay=0.1,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    use_distributed_optimizer=True,
-    clip_grad=1.0,
-)
-ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True)
-
-config = ConfigContainer(
-    ddp=ddp_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation.
-
-## Model Parallelism
-
-Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need for per-GPU memory. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance.
-
-### Tensor Parallelism
-
-Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads.
-
-![Tensor Parallelism Overview](images/tp1.png)
-*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.*
-
-![Tensor Parallelism Implementation](images/tp2.png)
-*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.*
-
-#### Enable Tensor Parallelism
-
-To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. This parameter determines the number of GPUs among which the model's tensors are partitioned.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with tensor parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Enable TP across 2 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Implement Tensor Parallelism
-
-Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html).
-
-### Pipeline Parallelism
-
-Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially.
-
-![Pipeline Parallelism](images/pp.gif)
-*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.*
-
-#### Enable Pipeline Parallelism
-
-To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with pipeline parallelism
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,  # Distribute layers across 4 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Interleaved Pipeline Parallel Schedule
-
-To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`:
-
-```python
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=2,  # 2 model chunks per pipeline stage
-    # ... other model parameters
-)
-```
-
-For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism).
-
-#### Implement Pipeline Parallelism
-
-The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html).
-
-### Expert Parallelism and Mixture of Experts (MoE)
-
-Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers.
-
-MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input.
-
-![Expert Parallelism](images/ep.png)
-*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.*
-
-#### Basic MoE Configuration
-
-To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure basic MoE model
-model_config = GPTModelProvider(
-    num_moe_experts=8,           # Number of experts in the MoE module
-    moe_router_topk=2,           # Number of experts activated per token
-    moe_ffn_hidden_size=8192,    # Hidden size for expert FFN layers
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Parallelism
-
-To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size.
-
-```python
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,  # Distribute 8 experts across 4 GPUs (2 experts per GPU)
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Tensor Parallelism
-
-To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration:
-
-```python
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    expert_tensor_parallel_size=2,  # Apply tensor parallelism within each expert
-    # ... other model parameters
-)
-```
-
-#### Advanced MoE Features
-
-Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures.
-
-##### DeepEP and HybridEP Optimizations
-
-DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures:
-
-- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs
-- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs
-
-These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads.
-
-**Enable DeepEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-```
-
-**Enable HybridEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply HybridEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep")
-```
-
-**GPU Architecture Requirements:**
-
-- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-
-The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware.
-
-##### Token Dropping for Load Balancing
-
-Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    moe_router_topk=2,
-    moe_token_dispatcher_type="alltoall",  # Required for token dropping
-    moe_router_load_balancing_type="aux_loss",  # Required load balancing type
-    # ... other model parameters
-)
-
-# Apply token dropping with capacity factor
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,  # Capacity multiplier per expert
-    moe_pad_expert_input_to_capacity=True,  # Pad inputs to capacity length
-)
-```
-
-**Configuration Parameters:**
-
-- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing.
-- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes.
-
-**Requirements:**
-
-- Token dispatcher must be `alltoall` or `alltoall_seq`
-- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none`
-
-**Trade-offs:**
-
-Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed.
-
-#### Complete MoE Configuration Example
-
-Here's a complete example showing how to configure an MoE model with advanced optimizations:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_layers=32,
-    hidden_size=4096,
-    num_attention_heads=32,
-    
-    # MoE configuration
-    num_moe_experts=8,                    # 8 experts total
-    moe_router_topk=2,                    # Activate 2 experts per token
-    moe_ffn_hidden_size=8192,            # Expert FFN hidden dimension
-    moe_token_dispatcher_type="alltoall", # Token dispatcher type
-    moe_router_load_balancing_type="aux_loss",  # Load balancing
-    
-    # Expert parallelism
-    expert_model_parallel_size=4,         # Distribute experts across 4 GPUs
-    expert_tensor_parallel_size=2,        # Apply TP within each expert
-    
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization (for Ampere/Hopper GPUs)
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-
-# Apply token dropping for load balancing
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,
-    moe_pad_expert_input_to_capacity=True,
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Expert Parallelism Implementation
-
-The Megatron Bridge implementation of EP uses functionality from Megatron Core. Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details.
-
-## Activation Partitioning
-
-In LLM training, a large memory space is needed to store the input activations of the network layers. Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes.
-
-### Sequence Parallelism
-
-Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency.
-
-![Sequence Parallelism](images/sp.png)
-*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.*
-
-#### Enable Sequence Parallelism
-
-To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with sequence parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Required for sequence parallelism
-    sequence_parallel=True,        # Enable sequence parallelism
-    # ... other model parameters
-)
-```
-
-#### Implement Sequence Parallelism
-
-The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py).
-
-### Context Parallelism
-
-Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. Unlike Sequence Parallelism (SP) that partitions the activations of specific layers, CP divides the activations of all layers.
-
-CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences.
-
-#### Enable Context Parallelism
-
-To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with context parallelism
-model_config = GPTModelProvider(
-    context_parallel_size=2,  # Distribute sequence across 2 GPUs
-    # ... other model parameters
-)
-```
-
-For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs.
-
-#### Implement Context Parallelism
-
-Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using advanced communication schemes like all-gather and reduce-scatter transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency.
-
-For more detailed technical information and implementation details, visit:
-- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html)
-- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py)
-- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py)
-
-## Combined Parallelism Example
-
-Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer, OptimizerConfig
-
-# Configure model with multiple parallelism strategies
-model_config = GPTModelProvider(
-    # Model parallelism
-    tensor_model_parallel_size=2,      # 2-way tensor parallelism
-    pipeline_model_parallel_size=4,    # 4-way pipeline parallelism
-    virtual_pipeline_model_parallel_size=2,  # Interleaved pipeline
-    
-    # Activation partitioning
-    sequence_parallel=True,            # Enable sequence parallelism (requires TP > 1)
-    context_parallel_size=2,           # 2-way context parallelism
-    
-    # Expert parallelism (for MoE models)
-    num_moe_experts=8,                 # 8 experts
-    expert_model_parallel_size=4,      # Distribute experts across 4 GPUs
-    
-    # ... other model parameters
-)
-
-# Configure distributed optimizer
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    use_distributed_optimizer=True,    # Enable distributed optimizer
-    # ... other optimizer parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-## Data Parallel Size Calculation
-
-The data parallel size is automatically calculated based on the total world size and model parallelism settings:
-
-```
-data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size)
-```
-
-For example, with 32 GPUs total and the configuration above:
-- `tensor_model_parallel_size = 2`
-- `pipeline_model_parallel_size = 4` 
-- `context_parallel_size = 2`
-- `data_parallel_size = 32 / (2 × 4 × 2) = 2`
-
-## Strategy Selection Guide
-
-Choosing the right combination depends on model size, hardware topology,
-and sequence length.
-
-### Dense Models by Size
-
-| Model size | GPUs | Recommended starting point |
-|---|---|---|
-| < 1B | 1-8 | DP only |
-| 1-10B | 8-16 | TP=2-4 + DP |
-| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP |
-| 70-175B | 64-256 | TP=8 + PP=4-8 + DP |
-| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP |
-
-### MoE Models
-
-MoE models differ fundamentally from dense models: only a fraction of
-parameters are active per token, so TP can often stay at 1 or 2. EP is
-the primary scaling dimension.
-
-| Total / active params | Typical layout |
-|---|---|
-| < 20B | EP only (TP=1, PP=1) |
-| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 |
-| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 |
-| 500B+ | TP=2 + PP=16 + EP=32-64 |
-
-### By Hardware Topology
-
-- **Single node with NVLink**: maximize TP within the node (up to TP=8).
-- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes.
-- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling.
-
-### By Sequence Length
-
-| Sequence length | Recommendation |
-|---|---|
-| < 2K | standard TP + PP + DP |
-| 2K-8K | add SP (`sequence_parallel=True`) |
-| 8K-32K | add CP=2 |
-| 32K+ | add CP=4-8, consider hierarchical CP |
-
-For operational details on configuring combined parallelism, troubleshooting
-layouts, and memory estimation, see the
-[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md).
-
-## Configuration Guidelines
-
-### Memory Optimization
-- Use **distributed optimizer** to reduce optimizer state memory
-- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory
-- Use **context parallelism** for long sequence training
-- Consider **pipeline parallelism** for very large models that don't fit on a single GPU
-
-### Performance Optimization
-- **Tensor parallelism** works best within a single node (high bandwidth)
-- **Pipeline parallelism** can work across nodes but requires careful batch size tuning
-- **Context parallelism** is essential for long context scenarios
-- **Expert parallelism** is specific to MoE models; the expert-parallel size must evenly divide the number of experts
-- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures
-
-### Compatibility
-- **Sequence parallelism** requires `tensor_model_parallel_size > 1`
-- **Expert parallelism** requires MoE models (`num_moe_experts > 0`)
-- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs
-- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs
-- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher
-- All parallelism strategies can be combined, but total parallelism must divide evenly into the world size
-
-## Related Artifacts
-
-- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification
-- **Knowledge card**: [skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status
-
-## Resources
-
-- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/)
-- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/)
-- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM)
-- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-guide.md
-```md
-# Performance Tuning Guide
-
-Megatron-Bridge provides a wide range of features for performant and memory-efficient LLM training on GPUs, and comes pre-configured with optimal settings. However, factors such as model architecture, hyperparameters, GPU count, and GPU type can affect the available options, and additional tuning may be necessary to achieve optimal performance. This document explores the factors that affect training performance, highlights common issues, and outlines techniques for performance tuning that lead to higher MFU (Model FLOPS Utilization) and lower TCO (Total Cost of Ownership).
-
-```{Note}
-This guide makes references to several configuration settings. These settings will be referenced relative to the config class that contains them, e.g. `OptimizerConfig.lr`. Please see <project:apidocs/index.rst> for more details on configuration settings.
-```
-
-```{Note}
-This guide references several configuration settings from `TransformerConfig`. Please apply these to the appropriate ModelProvider for your model, e.g. `GPTModelProvider`, as the `ConfigContainer` does not accept a raw `TransformerConfig`.
-```
-
-## Low Precision Training
-
-1. Expected speedup of FP8 training compared to BF16 training
-
-   > 1. The default low-precision LLM training recipe applies FP8 computation exclusively to the linear layers within the Transformer block, typically achieving a speedup of 1.2–1.5X.
-   > 2. However, the actual speedup depends on the proportion of training time spent on these linear layers. For instance, smaller LLMs with a limited hidden size exhibit lower FP8 speedup, as linear layers scale with O(sequence_length × hidden_size²) complexity, whereas the other element-wise computation layers (e.g., layer norms, dropouts, RoPE, and simple math functions) scale with O(sequence_length × hidden_size), and dot-product attention scales with O(sequence_length² × hidden_size). Consequently, the contribution of linear layers to the overall training time is smaller in such models.
-   > 3. Different FP8 recipes use varying quantization block sizes, affecting performance. Smaller quantization blocks generally incur higher overhead in both quantization and GEMM execution. For example, MXFP8 with a 1×32 quantization block performs less efficiently than full tensor-wise FP8 scaling.
-
-2. Common issues of low FP8 training speedup
-
-   > 1. Host performance boundness when LLM uses small GPU kernels (see [Lowering Host Overhead and Jitters](#lowering-overhead-jitter)).
-   > 2. A low proportion of linear layers in training step time that use FP8 computation.
-
-## Parallel Mapping Strategies
-
-1. Data Parallelism using Distributed Optimizer
-
-   > 1. You should begin with data-parallel (DP) mapping. As long as the model and activation memory fit within the GPUs, data parallelism generally offers optimal performance, minimizes communication overhead, and maximizes per-GPU tensor sizes (compared to per-tensor sharding).
-   >
-   > 2. Megatron-Bridge uses the distributed optimizer as the default method for data-parallel training. It shards master parameters and optimizer states across data-parallel ranks, reducing model state memory usage without increasing communication overhead compared to traditional data-parallel training.
-   >
-   >    > 1. `OptimizerConfig.use_distributed_optimizer=true`
-
-2. Per-tensor Sharding (Tensor-parallel or Context-parallel mappings)
-
-   > 1. Tensor parallelism (TP) is the primary recommendation when a model exceeds GPU memory capacity under data-parallel mapping. However, since it involves higher communication overhead, the tensor-parallel size should ideally be confined to the high-bandwidth intra-node network (NVLink domain).
-   >
-   >    > 1. `TransformerConfig.tensor_model_parallel_size=<int>`
-   >
-   > 2. When the sequence length in a training run is significantly larger than the hidden size, activation memory can overflow. In such cases, context parallelism (CP) helps by sharding tensors along the sequence dimension, allowing the workload to fit within limited GPU memory and improving performance. Like tensor parallelism (TP), CP requires inter-GPU communication of activations. However, for the same tensor sizes, CP generally results in lower communication volume.
-   >
-   >    That said, CP’s effectiveness depends on the relative sizes of the sequence length and hidden size. When the sequence length is smaller than the hidden size, CP produces narrow (or "skinny") tensor shards on each GPU. This reduces data reuse and can degrade performance.
-   >
-   >    Additionally, because CP shards activations, it also partitions optimizer states in distributed training. As a result, optimizer state partitioning spans both the data parallel (DP) and context parallel (CP) dimensions.
-   >
-   >    > 1. `TransformerConfig.context_parallel_size=<int>`
-   >
-   > 3. Performance tips:
-   >
-   >    > 1. A large tensor-parallel or context-parallel size is not recommended unless the hidden size or sequence length is large enough to maintain sufficient per-GPU parallelism and avoid excessive communication overhead. For example, using a tensor-parallel size of 8 for LLAMA 3 70B could lead to low GPU utilization and make training host-performance bound.
-   >    > 2. You can combine TP and CP to optimize performance by balancing communication overhead. For example, using TP=2 along with CP=2 can give better performance than TP=4 when the sequence size is larger than the hidden size.
-   >    > 3. For additional tips, see [Long Sequence Training](#long-sequence-train).
-
-1. Pipeline Parallelism
-
-   > 1. Pipeline parallelism (PP) is necessary when a model cannot fit within GPU memory using tensor parallelism. Also, virtual pipeline parallelism (VPP) should be used in conjunction with pipeline parallelism to reduce the overhead caused by pipeline warm-up and flush bubbles.
-   >
-   >    > 1. `TransformerConfig.pipeline_model_parallel_size=<int>`
-   >    > 2. `TransformerConfig.virtual_pipeline_model_parallel_size=<int>`
-   >
-   > 2. Performance tips in PP and VPP sizing:
-   >
-   >    > 1. PP can also be combined with per-tensor sharding methods to mitigate the impact of sharding inefficiencies and pipeline bubbles. For instance, TP4 + PP2 may outperform TP8 when both mappings fit into memory because using a large TP reduces per-GPU tensor sizes but increases the communication cost, increasing the exposed communication.
-   >    > 2. VPP increases inter-stage communication overhead. When a global batch contains many micro-batches, using a smaller VPP size can improve performance, as the exposed communication cost outweighs the reduction in pipeline bubbles.
-   >
-   > 3. Asymmetric Transformer layer allocation across pipeline stages
-   >
-   >    > 1. An LLM with a large vocabulary size has computationally heavy embedding lookup and projection operations, leading to load imbalance across pipeline stages. To address this, Megatron-Bridge provides an option to allocate one fewer Transformer layer in the first and last pipeline stages, which handle embedding lookup and projection, to better balance workloads.
-   >    >
-   >    >    > 1. `GPTProvider.account_for_embedding_in_pipeline_split=true`
-   >    >    > 2. `GPTProvider.account_for_loss_in_pipeline_split=true`
-
-2. Expert Parallelism
-
-   > 1. Expert Parallelism (EP) is designed specifically for Mixture-of-Experts (MoE) models to efficiently distribute sparse MLP weights across multiple chips. It can be used in combination with other parallelism strategies such as Tensor Parallelism (TP), Context Parallelism (CP), Pipeline Parallelism (PP), Data Parallelism (DP), and Fully Sharded Data Parallel (FSDP). In the current design, the dense attention part and the sparse MLP part are fully decoupled in terms of their TP, CP, and DP parallelism configurations. Expert Tensor Parallelism (ETP) is introduced to specifically control the tensor parallelism for the sparse MLP part. ETP uses TP for dense layers for the ranks allocated for EP in sparse layers. On the other hand, the baseline is DEP, which folds DP in dense layers for EP in sparse layers.
-   >
-   >    > 1. `TransformerConfig.expert_model_parallel_size=<int>`
-   >    > 2. `TransformerConfig.expert_tensor_parallel_size=<int>`
-   >
-   > 2. Performance tips in hybrid folding options and EP sizing:
-   >
-   >    > 1. Typically, EP is kept within the high-bandwidth intra-node network (NVLink domain) to minimize the communication overhead it can introduce. However, using communication overlap techniques—such as pipeline overlap or 1F1B overlap—along with PP (e.g., DualPipe) might make it possible to expand EP into the inter-node networks.
-   >    >
-   >    > 2. Within the sparse MLP block, DP replaces CP because it has no impact on the computation pattern based on the dispatched tokens in each EP rank.
-   >    >
-   >    > 3. Usually, ETP is set to 1 to avoid significant communication overhead that comes with applying TP to MLP GEMMs.
-   >    >
-   >    > 4. When multiple experts are placed on a single chip after applying Expert Parallelism, enabling grouped GEMM can significantly improve computation efficiency.
-   >    >
-   >    >    > 1. `TransformerConfig.moe_grouped_gemm=True`
-
-3. Fully Sharded Data Parallelism
-
-   > 1. Megatron-Bridge supports PyTorch-native FSDP. FSDP can be used in combination with per-tensor sharding methods.
-   >
-   >    > 1. To use PyTorch FSDP2:
-   >    >
-   >    >    > 1. `DistributedInitConfig.use_torch_fsdp2=True`
-   >
-   > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios:
-   >
-   >    > 1. Small models with a large sequence, thus the parameter AllGather and gradient ReduceScatter can effectively be hidden under computation and the short communication overlap causes minor interference to the computation under overlap.
-   >    > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect.
-   >    > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size.
-
-   <!-- TODO: support megatron custom fsdp -->
-   <!-- > 1. Megatron-Bridge supports two Fully Sharded Data Parallelism (FSDP) implementations: PyTorch-native FSDP and a custom Megatron FSDP built within Megatron Core. While both follow the same sharding principles, the custom implementation is further optimized for performance. The performance gain of the custom FSDP comes primarily from minimizing the data movement to the communication tensors and reusing communication buffers. Both FSDP methods can be used in combination with per-tensor sharding methods. -->
-   <!-- > -->
-   <!-- >    > 1. To use PyTorch FSDP2: -->
-   <!-- >    > -->
-   <!-- >    >    > 1. `DistributedInitConfig.use_torch_fsdp2=True` -->
-   <!-- >    > -->
-   <!-- >    > 2. To use Custom Megatron FSDP: -->
-   <!-- >    > -->
-   <!-- >    >    > 1. `recipe.trainer.strategy.fsdp="megatron"` -->
-   <!-- >    >    > 2. `recipe.trainer.strategy.ddp.data_parallel_sharding_strategy="optim_grads_params"` -->
-   <!-- > -->
-   <!-- > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios: -->
-   <!-- > -->
-   <!-- >    > 1. Small models with a large sequence, thus the parameter AllGather and gradient ReduceScatter can effectively be hidden under computation and the short communication overlap causes minor interference to the computation under overlap. -->
-   <!-- >    > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect. -->
-   <!-- >    > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size. -->
-
-4. Heterogeneous Encoder Parallelism
-
-   > 1. Encoder Pipeline Parallel
-   >
-   >    > 1. Use `T5ModelProvider.encoder_pipeline_model_parallel_size`.
-   >    > 2. In an Encoder-Decoder architecture like Multimodal models (VLMs like NeVA etc.), Encoder Pipeline Parallel can be used to add pipeline parallelism to the encoder.
-   >    > 3. Pipeline parallelism controls the amount of pipelining in the decoder part.
-   >    > 4. Encoder Pipeline Parallel is limited to 1 at the moment, i.e., the encoder can occupy a maximum of 1 PP stage.
-   >    > 5. By default, Encoder Pipeline Parallel is 0 and Decoder Pipeline Parallel is 1.
-   >    > 6. When the Encoder Pipeline Parallel size is 0, it shares the first PP stage of the Decoder.
-   >
-   > 2. Encoder Tensor Parallel
-   >
-   >    > 1. Use `T5ModelProvider.encoder_tensor_model_parallel_size`.
-   >    > 2. Since encoders tend to be much smaller than decoders, we also provide the ability to set a different amount of tensor parallelism to the encoder than the decoder.
-   >    > 3. By default, encoder tensor parallel is set to 0, i.e., the amount of tensor parallelism in the encoder is equal to tensor parallelism in the decoder.
-   >    > 4. To use this option, Encoder Pipeline Parallel must be greater than 0 as we need the encoder to be on its own pipeline stage.
-   >    > 5. Encoder Tensor Parallel size is limited to be less than or equal to Tensor parallel size.
-   >
-   > 3. Total number of GPUs required when these features are used is:
-   >
-   >    > 1. Data Parallel size * Context Parallel size * ((Encoder TP * Encoder PP) + (Decoder TP * Decoder PP))
-   >
-   > 4. These features are experimental and may still have bugs. There are critical bug fixes that will be made in a future release.
-
-5. Parallel mapping strategies with NVL72
-
-   > 1. Training with only data parallelism or FSDP makes it straightforward to fully utilize the bandwidth of an NVL72 system. However, when combining multiple parallelism strategies, it's important to ensure that high-volume communicators remain confined within each NVL72 domain. For example, with TP=4, DP=16, and PP=4, the GPUs in the first TP group of DP1/PP1 span both NVLink and network domains, causing communication performance to be bottlenecked by the slower network link. To avoid this, you may choose TP and DP sizes such that the product of TP × DP divides evenly into the NVL72 configuration. If the model-parallel size does not align naturally, padding may be required to support non-divisible group sizes.
-   > 2. To avoid this partitioning complexity, you can just use 64 GPUs out of the 72 GPUs.
-
-## Communication Overlaps and Tuning
-
-1. Data-parallel communication of Distributed Optimizer
-
-   > 1. Distributed optimizer overlaps parameter AllGathers with the forward computation of the first micro-batch and gradient ReduceScatters with the backward computation of the last micro-batch.
-   >
-   >    > 1. `DistributedDataParallelConfig.overlap_param_gather=true`
-   >    > 2. `DistributedDataParallelConfig.overlap_grad_reduce=true`
-   >
-   > 2. When using the distributed optimizer with pipeline parallelism (PP) + virtual pipeline parallelism (VPP), DP communications overlap with multiple micro-batches, increasing the opportunity for effective overlap. Also, Megatron-Bridge aligns the execution timing of DP communications across pipeline-parallel ranks to synchronize the computing kernel slowdown from the overlap.
-   >
-   >    > 1. `DistributedDataParallelConfig.align_param_gather=true`
-   >
-   > 3. Slow DP communication in large-scale training:
-   >
-   >    > 1. Distributing optimizer states across a partial DP domain reduces communication costs over high-latency Ethernet networks. Model states remain replicated outside the distributed domain. During the final micro-batch backpropagation, gradient ReduceScatters occur within the distributed domain, followed by AllReduce in the non-distributed domain. Parameter AllGathers are performed only within the distributed domain.
-   >    >
-   >    >    > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances= <int>`
-   >    >
-   >    > 2. A large message size for DP communication is recommended to maximize network bandwidth utilization. You can achieve this by increasing the communication bucket size.
-   >    >
-   >    >    > 1. `DistributedDataParallelConfig.bucket_size=<number_of_elements: int>`
-   >
-   > 4. A common reason for DP communication overlap failure:
-   >
-   >    > 1. Persistent Layer Normalization (LN) kernels from Transformer Engine use spin-waiting for all SMs in the GPU, causing the LN kernel and subsequent computation kernels to be scheduled only after DP communication. To prevent this, an appropriate SM margin should be configured using the following environment variables.
-   >    >
-   >    >    > 1. `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>`
-   >    >    > 2. `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>`
-
-<!-- 2. Custom Megatron FSDP -->
-
-<!--    > 1. Unless you specify the communication bucket size, MCORE FSDP uses fixed communication overlap that overlaps the parameter AllGather and gradient ReduceScatter of each Transformer layer with its associated forward and backward computations. -->
-
-3. Tensor-parallel (TP) communication (with sequence parallelism)
-
-   > 1. Megatron-Bridge currently uses the userbuffer backend in Transformer Engine for TP communication overlaps. This offers the pipelined overlap of the TP communication with dependent computation.
-   >
-   >    > 1. `CommOverlapConfig.tp_comm_overlap`
-   >
-   > 2. The overlap method, resource, and precision of the TP communication overlaps are configurable, and the most performant configurations are set in the Megatron-Bridge training recipes by default. Also, you can set a custom TP communication overlap configuration via the below interface following the structure of TransformerLayerTPOverlapCfg class.
-   >
-   >    > 1. `CommOverlapConfig.tp_comm_overlap_cfg=<TransformerLayerTPOverlapCfg>`
-   >
-   > 3. TP communication overlap setting tips
-   >
-   >    > 1. Balancing the number of SMs between communication and GEMM
-   >    >
-   >    >    > 1. For AllGather/ReduceScatter bulk and ReduceScatter pipelined overlap, you can adjust the number of SMs to balance communication and GEMM execution. Allocating too many SMs to communication may degrade GEMM performance, while too few may expose communication overhead. The default SM allocation for communication is 16, but you can fine-tune it based on profiling results.
-   >    >    > 2. `TPOverlapCfg.num_sm=<int>`
-   >    >
-   >    > 2. CGA sizing to improve SM utilization
-   >    >
-   >    >    > 1. The CGA size can be set between 1 and 4, but it should not exceed the number of SMs allocated for communication. We recommend using CGA ≤ 2 to prevent potential SM rasterization that could impact GEMM performance.
-   >    >    > 2. `TPOverlapCfg.cga_size=<int≤4>`
-   >    >
-   >    > 3. Use 4× splits for ReduceScatter and GEMM overlap to optimize the balance between GEMM efficiency and communication exposure.
-   >    >
-   >    >    > 1. In GEMM-then-ReduceScatter pipeline overlap, a 1× ReduceScatter chunk remains exposed. A small split size increases communication exposure, while a large split size may degrade performance due to aggregated GEMM wave quantization. We find that num_splits = 4 generally provides the best performance.
-   >    >    > 2. `TPOverlapCfg.num_split=<int>`
-   >
-   > 4. Common reason for TP comm overlap failure at Hopper
-   >
-   >    > 1. On H100 GPUs, the environment variable `CUDA_DEVICE_MAX_CONNECTIONS=1` should be set. Otherwise, TP communication kernels may be scheduled only at the end of the GEMM they are meant to overlap with, so no overlap occurs.
-   >    > 2. Pipelined TP communication overlap is used by a static userbuffer registered upon model initialization. Therefore, it doesn't support activation tensors dynamically changing between steps or between Transformer layers.
-
-4. Context-parallel (CP) communication
-
-   > 1. CP communication is configurable via "cp_comm_type", which can be "p2p", "all_gather", "a2a", or "a2a+p2p". Communications of "p2p" are implemented as ring-exchange send/receive operations, and they are hard-coded to overlap with the attention compute of sequence chunks. See [Long Sequence Training](#long-sequence-train) for more details.
-
-5. Expert-parallel communication
-
-   > 1. To hide the A2A/AG communication introduced by EP, pipeline split overlap or 1F1B overlap alongside Pipeline Parallelism could be possible. It will be added to Megatron-Bridge in future releases.
-
-6. Pipeline-parallel (PP) send/receive communication
-
-   > 1. PP send/recv in steady 1F1B states are set to be overlapped with computes by default.
-   > 2. The PP send/recv in warmup and flush are exposed by default.
-
-(comm-data-types)=
-## Communication Data Types
-
-1. FP8 data-parallel parameter AllGather in Distributed Optimizer and FSDP
-
-   > 1. Megatron-Bridge supports FP8 parameter AllGather for per-tensor FP8 scaling recipes. This operation is lossless, enhancing performance while reducing memory usage.
-   >
-   >    > 1. `MixedPrecisionConfig.fp8_param=true`
-
-2. BF16 (instead of FP32) data-parallel reduction in Distributed Optimizer and FSDP
-
-   > 1. We have validated that BF16 reduction is numerically safe across numerous model training runs. However, BF16 reduction with a large data-parallel size (e.g., DP ≥ 128), especially with the Ring reduction algorithm—which accumulates copies sequentially—may impact numerical stability. When using SHARP with NVIDIA InfiniBand, BF16 reduction is more robust, as it performs binary additions with higher precision for intermediate partial reductions.
-   >
-   >    > 1. `DistributedDataParallelConfig.grad_reduce_in_fp32=false`
-
-3. FP8 tensor-parallel ReduceScatter
-
-   > 1. When communication latency exceeds GEMM execution time, using FP8 input ReduceScatter can better hide communication overhead. This approach has low numerical impact, as the GEMM output must be cast to FP8 and then converted back to high precision during reduction.
-   >
-   >    > 1. `TPOverlapCfg.fp8_buf=true`
-
-4. FP8 A2A Dispatch for expert parallel communication
-
-   > 1. Megatron-Bridge is working on supporting FP8 A2A dispatch (before expert FC1), but still keeps BF16 A2A combine (after expert FC2).
-
-## Performance at Scale
-
-1. Scaling a training job is typically achieved by increasing the size of the data-parallel domain. In large-scale training, this often results in a small number of micro-batches per global batch—or even a single micro-batch—causing most computations to overlap with data-parallel communication. To maintain high performance in such scenarios, you should focus on minimizing the overhead of data-parallel communication and reducing host-driven inter-GPU jitter.
-
-2. You can lower the overhead of data-parallel communication by (1) reducing the communication precision e.g., BF16 for gradient reduction and FP8 parameter gathering, (2) improving the efficiency of communication by increasing the data-parallel communication message size or using the hierarchical data-parallel reduction, or (3) using multi-cast and switch reduction with SHARP in case of InfiniBand network.
-
-   > 1. Using BF16 gradient reduction and FP8 parameter gather are described in [Communication Data Types](#comm-data-types)
-   >
-   > 2. For non-pipeline-parallel training, the data-parallel communication bucket size can be adjusted using the knobs below. In pipeline-parallel training, however, the bucket size is fixed and determined by the number of parameters assigned to each virtual pipeline rank.
-   >
-   >    > 1. `DistributedDataParallelConfig.bucket_size=<int: bytes>`
-   >
-   > 3. Setting the knob below splits the data-parallel domain of the distributed optimizer into a sharding domain and a replication domain. Gradient reduction then occurs in two stages—one within each domain—avoiding the use of a single large flat ring for collective operations that have high latency.
-   >
-   >    > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances=<int: ≤dp_size>`
-
-3. Ideas to reduce the host-driven inter-GPU jitters are discussed in [Lowering Host Overhead and Jitters](#lowering-overhead-jitter).
-
-(lowering-overhead-jitter)=
-## Lowering Host Overhead and Jitters
-
-1. Common observations associated with host overhead
-
-   > 1. Significantly low GPU FLOPS.
-   > 2. Small performance gain of low-precision (FP8) training.
-   > 3. Small LLMs with small hidden size or sequence length or fine-tuning without sequence packing
-   > 4. High multi-GPU communication variation.
-
-2. Increase micro-batch size and reduce per-tensor sharding
-
-   > 1. The most common way to increase per-GPU tensor size is by increasing the micro-batch size or minimizing unnecessary per-tensor sharding (e.g., TP or CP) when GPU memory permits.
-
-3. Manual garbage collection to align the host interruption across GPUs
-
-   > 1. Megatron-Bridge manually aligns the timing of garbage collection across GPUs, which significantly mitigates the host overhead compared to the baseline automatic garbage collection.
-   >
-   >    > 1. `TrainingConfig.manual_gc_interval=<int>`
-
-4. CUDA graph to eliminate repeated static host code execution
-
-   > 1. Megatron-Bridge supports graph capture, significantly reducing host overhead. CUDA Graph is applicable only to LLMs with a static tensor shape across training steps. For example, it supports fixed-size packed sequences but does not handle sequences with varying lengths at each step. Also, MoE models with token-dropless propagation have limited CUDA graph support, restricted to the dense modules only.
-   > 2. CUDA graph requires additional memory for static buffer management, typically adding a few gigabytes for static buffers, while models with PP size > 1 may consume over 10GB. We are actively working to reduce this memory overhead.
-   > 3. See [CUDA Graphs](training/cuda-graphs.md) for configuration details (`cuda_graph_impl`, `cuda_graph_scope`).
-
-5. Bind CPU memory for GPU processes
-
-   > 1. Binding CPU cores to GPU processes helps mitigate long latency issues and ensures minimal variation in GPU queuing latency across GPUs. This optimization has a significant impact, particularly when the communication domain size is large.
-   > 2. Example command line for an x86-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/4)) --membind=$((SLURM_LOCALID/4)) <run script>`
-   > 3. Example command line for a Grace-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/2)) --membind=$((SLURM_LOCALID/2)) <run script>`
-
-(reducing-memory-overflow)=
-## Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency
-
-1. Activation recomputation
-
-   > 1. Megatron-Bridge LLMs default to dot-product attention-only recomputation using Flash Attention, efficiently regenerating large intermediate activations from the attention operation with minimal computational overhead.
-   >
-   > 2. Megatron-Bridge also supports recomputing the full intermediate activations of a Transformer block, significantly reducing activation memory usage at the cost of approximately 30% additional computation. The number of Transformer blocks to recompute can be adjusted using a configurable setting.
-   >
-   >    > 1. `TransformerConfig.recompute_granularity=full`
-   >    > 2. `TransformerConfig.recompute_method=block`
-   >    > 3. `TransformerConfig.recompute_num_layers=<int:≤num_layers_in_the_model>`
-
-2. Activation offloading to host memory
-
-   > 1. Megatron-Bridge supports offloading activation memory to host memory, essential for training tasks constrained by activation memory. This is particularly useful for scenarios like (1) FSDP, where model state memory is minimized through sharding but activation memory remains high, (2) LoRA, which has frozen parameters but significant activation memory demands, and (3) the training with a large sequence length. The efficiency of activation offloading depends on both the interconnect bandwidth between the GPU and host and the host memory bandwidth. From this perspective, Grace-based systems like the GB200 enhance offloading performance by optimizing these bandwidths.
-   >
-   > 2. The following knobs should be configured to enable offloading and specify the number of Transformer layers to offload to host memory. The maximum number of layers that can be offloaded depends on host memory capacity, which may be lower when the CPU is shared among multiple GPUs.
-   >
-   >    > 1. `TransformerConfig.cpu_offloading=True`
-   >    > 2. `TransformerConfig.cpu_offloading_weights=False`
-   >    > 3. `TransformerConfig.cpu_offloading_num_layers= <int:≤activation_offload_layers>`
-   >
-   > 3. Environment variable settings to avoid resource conflict between CPU memory offloading and network communication
-   >
-   >    > 1. `NCCL_NET_GDR_LEVEL=PHB # NCCL <=2.25`
-   >    > 2. `NCCL_NET_GDR_C2C=1     # NCCL >=2.26`
-   >
-   > 4. Optimization tips
-   >
-   >    > 1. Given the ratio between activation volume and computational operations, offloading all layer activations naively can become a performance bottleneck. Optimizing performance requires tuning the number of layers to offload while balancing it with recomputation.
-
-3. Weight memory-optimized BF16 training
-
-   > 1. In BF16 training, Megatron-Bridge optimizes memory usage by storing only the BF16 remainder of the master weight copies for the next optimizer update. This is possible because BF16 data can be represented using a subset of FP32 bits, allowing Megatron-Bridge to avoid redundant storage of the FP32 portion used for BF16 representation. This is default enabled when using precision-aware optimizer in Megatron Core.
-   >
-   >    > 1. `OptimizerConfig.use_precision_aware_optimizer=True`
-
-4. Common memory usage hikes from environment variable setting
-
-   > 1. The below environment variables will (1) avoid preserving the buffers for NCCL communication and (2) disable NVLSharp when not used. Both these options lower the GPU memory usage.
-   >
-   >    > 1. `TORCH_NCCL_AVOID_RECORD_STREAMS=1`
-   >    > 2. `NCCL_NVLS_ENABLE=0`
-   >
-   > 2. While not enabled by default, you can further reduce memory usage caused by segmentation penalties by setting the env var shown below.
-   >
-   >    > 1. `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`
-
-5. Keep parameters in FP8 in FP8 training
-
-   > 1. In FP8 training, after optimizer step execution, we can keep the parameters in FP8. Compared to the baseline that keeps the intermediate weight values in BF16, FP8 parameters lower memory usage and improve communication performance. The below knob enables keeping the parameters in FP8.
-   >
-   >    > 1. `MixedPrecisionConfig.fp8_param_gather=True`
-
-## Operator Fusion
-
-1. You can control specific fusion behaviors using the following configuration knobs:
-
-   > 1. `TransformerConfig.masked_softmax_fusion=true`
-   > 2. `GPTProvider.cross_entropy_loss_fusion=true`
-   > 3. `GPTProvider.gradient_accumulation_fusion=true`
-   > 4. `TransformerConfig.bias_activation_fusion=true`
-   > 5. `TransformerConfig.bias_dropout_fusion=true`
-   > 6. `TransformerConfig.apply_rope_fusion=true`
-
-2. Megatron-Bridge offers different Flash Attention options, which can be chosen through the model config:
-
-   > 1. Let Transformer Engine decide (default): `TransformerConfig.attention_backend=AttnBackend.auto`
-   > 2. FlashAttention2: `TransformerConfig.attention_backend=AttnBackend.flash`
-   > 3. cuDNN fused attention: `TransformerConfig.attention_backend=AttnBackend.fused`
-
-(long-sequence-train)=
-## Long Sequence Training
-
-1. Problem of long sequence training
-
-   > 1. Training with long sequence length can lead to memory overflow due to the huge memory cost of activations. The problem could be solved by recomputing activations in backward, but it can impose up to ~30% overheads in each training step. Context parallelism is a better solution which splits the sequence dimension across multiple GPUs, so that each GPU only computes and saves activations of a sequence chunk. In this way, memory overflow is addressed without introducing any redundant compute.
-
-2. CP to shard activation (knob)
-
-   > 1. `TransformerConfig.context_parallel_size=<int>`
-   >
-   >    > 1. Both TP and CP can reduce activation memory overheads. It's not wise to be biased to either of them. Communications of TP and CP are overlapped by GEMM and Attention respectively. Blindly enlarging their sizes can make some communications hard to overlap. It's recommended to sweep a combination of TP+CP configs. The optimal config is expected to make full use of all related compute and do best overlapping, thereby achieving best end-to-end performance.
-   >
-   > 2. `TransformerConfig.cp_comm_type=<str> or <list of str>`
-   >
-   >    > 1. Megatron-Core provides multiple implementation variants of CP and allows you to make choices based on your specific use cases by configuring "cp_comm_type". The configuration value can be `p2p`, `all_gather`, `a2a`, or `a2a+p2p`. These communication types are compatible with each other, so they can be flexibly interleaved between transformer layers. You only need to provide a list, where each element corresponds to a layer.
-   >    > 2. `p2p`: exchanges KV sequence chunks in ring-topology. The P2P communications can be fully overlapped.
-   >    > 3. `all_gather`: inserts an all-gather before attention to get a full sequence of KV. The all-gather is exposed, but it should not impose big overheads if GQA/MQA are used, as they have very few KV heads.
-   >    > 4. `a2a`: is an implementation of DeepSpeed Ulysses. A2A communications are added before and after the attention module to gather full sequence length and further scatter heads in CP domain. A2A cannot be overlapped.
-   >    > 5. `a2a+p2p`: is a middle ground between `a2a` and `p2p`. This is useful for cases of big CP sizes, where each sequence chunk is too short to overlap P2P communications. It first does A2A in partial CP groups to gather relatively longer sequence chunks, then applies P2P implementation to the gathered chunks. It also can be helpful for hierarchical CP communications, for example A2A and P2P happen in NVLink and IBLink domains respectively.
-   >    > 6. With small and medium CP size, `p2p` is the recommended configuration because communications can be fully overlapped; "all_gather" also should work fine with GQA/MQA. As for strongly-scaling a sequence length with big CP sizes, the short chunk length can barely overlap the `p2p` communications, so `a2a+p2p` ought to be the preferred choice. `a2a` could be adopted in some cases for its simplicity. However, CP size can be restricted with "a2a" because it requires the number of attention heads to be divisible by CP size. Restricted CP size will finally limit the sequence length that can be run.
-
-3. Activation recomputation (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow))
-
-4. Activation offloading to host memory (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow))
-
-## Sequence Packing for Performant Fine-Tuning
-
-1. Dataset preparation
-
-   > 1. Fine-tuning datasets with shorter sequences of variable length can be packed into longer sequences, up to a set maximum length, for best efficiency.
-
-2. To use this feature, the microbatch size must be set to 1. In place of increasing the micro batch size, the maximum sequence length can be increased, which will effectively increase the number of individual sequences per packed sequence.
-
-3. Enabled with:
-
-   > 1. `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size=<max sequence length>`
-   > 2. `TrainingConfig.micro_batch_size=1`
-
-4. Performance benefits also include:
-
-   > 1. Inconsistent lengths between sequences in the fine-tuning dataset would reduce the computation efficiency. With a micro-batch size over 1, all sequences must be padded with empty tokens to the length of the longest one in the micro-batch. Similarly, some optimizations like CUDA graphs require uniform sequence lengths between micro-batches. Packed sequences are arranged so that the total number of tokens per packed sequence is as close to the maximum length as possible, making most processed tokens useful.
-   > 2. Likewise, when using data parallelism, variance in the time needed to process different batches can force all ranks to wait for the longest batch to finish; packed sequences reduce this variance.
-
-## GPU Core Clock Optimization
-
-1. Increase the clock ratio of GPU core over off-chip memory system
-
-   > 1. NVIDIA GPUs support a GPU core clock boost mode, which increases the core clock rate by reducing the off-chip memory clock rate. This is particularly beneficial for LLMs, which are typically compute throughput-bound.
-   >
-   >    > 1. `sudo nvidia-smi boost-slider --vboost 1 <run commandline>`
-
-## Profiling Options for Analysis-based Performance Tuning
-
-1. Nsight system profile
-
-   > 1. Megatron-Bridge provides an interface to enable the NVIDIA Nsight Systems profiler, which displays the GPU execution trace of all CUDA streams. You can check whether communication kernels overlap with computation kernels and adjust resource allocation to balance communication and computation. The Nsight Systems profile can be enabled using ProfilingConfig, as shown below.
-   > 2. `ProfilingConfig(use_nsys_profiler=True, profile_start_step=<int>, profile_end_step=<int>, profile_ranks=<[0,...]>)`
-
-2. Memory snapshot
-
-   > 1. Megatron-Bridge provides an interface to extract the memory snapshot that shows the memory allocation bytes, the allocation lifespan, and the function call stack. Extracting the memory snapshot can be enabled by ProfilingConfig as shown below.
-   > 2. `ProfilingConfig(record_memory_history=True, memory_snapshot_path=</path/to/store/the/output/file>, profile_ranks=<[0,...]>)`
-
-## DeepEP: Common Issues and Solutions
-
-DeepEP is a communication library optimized for Mixture-of-Experts (MoE) all-to-all operations. When using DeepEP for cross-node Expert Parallelism (EP), there are several common issues related to network transport and GPU-NIC affinity that can significantly impact performance.
-
-> Note: DeepEP is best optimized for NVL8 systems such as the DGX-B200 NVL8 or DGX-H200 NVL8. For GB200 NVL72 rack-scale systems, where 72 GPUs are interconnected within the same NVLINK domain, we recommend using [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) instead of DeepEP. HybridEP is maintained by NVIDIA and is specifically optimized for NVL72 rack scale systems. It is also integrated into the Megatron-core [fused all-to-all module](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.transformer.moe.fused_a2a.html) as an alternative backend under the `flex` token dispatcher.
->
-> Learn more about GB200 MoE training best practices [here](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md).
-
-### 1. Why is my DeepEP not working
-
-1. What is IBGDA and why is it a problem
-
-   DeepEP achieves optimal cross-node communication performance using InfiniBand GPU Direct Async (IBGDA), which is supported by ConnectX NICs in both InfiniBand and RoCEv2 modes. However, IBGDA is not always enabled by default—it often requires cluster administrators to actively configure the system and enable GPU Direct RDMA support in the InfiniBand (or RoCEv2) fabric. If this configuration step is skipped or unsupported in the cluster environment, IBGDA may be unavailable, which can prevent DeepEP inter-node EP capability from functioning.
-
-1. Network Transport: IBGDA vs. IBRC
-
-   > 1. IBGDA (InfiniBand GPU Direct Async) requires cluster administrators to enable GPU Direct RDMA and configure the InfiniBand subsystem. Many clusters do not have IBGDA enabled by default.
-   > 2. The official DeepEP main branch has removed support for IBRC (InfiniBand Reliable Connection), which previously served as a fallback mechanism. With IBRC, a CPU proxy thread will assist in processing the EP communication, which might have performance degradation compared to IBGDA, but we find such performance degradation doesn't overshadow the benefit of enabling wideEP in production training.
-
-2. Solution: NVSHMEM 3.5 with Automatic Transport Fallback
-
-   > 1. NVSHMEM 3.5 introduces improved auto-fallback support for cross-node communication under various network configurations. It can automatically select the best available transport (IBGDA, IBRC, or other supported mechanisms) based on cluster capabilities.
-   > 2. To benefit from NVSHMEM’s auto-fallback in DeepEP:
-   >    - Download the [official NVSHMEM 3.5.19-1 release](https://github.com/NVIDIA/nvshmem/releases/tag/v3.5.19-1). You can also choose to compile it from source in your container environment; we provide such examples later in this guide.
-   >    - Switch to the [DeepEP branch with native NVSHMEM API integration](https://github.com/seth-howell/DeepEP/tree/nvshmem_native_apis). This branch enables automatic use of NVSHMEM’s fallback mechanisms without requiring any manual code modifications.
-
-### 2. GPU-NIC Affinity and Bandwidth Contention
-
-A common cause of poor DeepEP performance is incorrect GPU-to-NIC (Network Interface Card) affinity, where multiple GPUs compete for bandwidth on a single NIC. As noted in [DeepEP PR #466](https://github.com/deepseek-ai/DeepEP/pull/466), cross-node EP performance may degrade if multiple GPUs use the same NIC, due to certain GPU-NIC affinity in some clusters. This PR provides a solution by supporting the environment variable `DEEP_EP_DEVICE_TO_HCA_MAPPING` to specify GPU-to-NIC mappings so that each GPU is automatically bound to the optimal NIC for maximum DeepEP throughput.
-
-With this PR's solution, we need the following environment variables to map GPUs to NICs correctly. First, you need to find out the names of the NICs by running `ibstat`. In our example, we found the following for one RoCEv2 DGX-B200 cluster:
-```
-> ibstat | grep ^CA
-CA 'rocep145s0'
-CA 'rocep146s0'
-CA 'rocep152s0'
-CA 'rocep153s0'
-CA 'rocep198s0'
-CA 'rocep199s0'
-CA 'rocep205s0'
-CA 'rocep206s0'
-```
-
-Use the following environment variables to map GPUs to NICs. Note that `0:rocep145s0:1` is formatted as `<CUDA_device_id>:<NIC_name>:<port>` so that each GPU will only be mapped to one dedicated NIC.
-```bash
-export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
-export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1"
-```
-
-### 3. Build DeepEP
-
-In this section, we provide a reference Dockerfile that shows how to build NVSHMEM 3.5 and the customized DeepEP into your container environment.
-
-Note that the following example is provided for DGX-B200 NVL8 systems, but similar ideas apply to Hopper generation as well—just change the Dockerfile accordingly. For example, you just need to change the compile target for SM90.
-
-Key points:
-
-- NVSHMEM source: https://github.com/NVIDIA/nvshmem/tree/v3.5.19-1
-- DeepEP branch that we cherry-picked with all the fixes above: https://github.com/zhongbozhu/DeepEP/tree/nvshmem_deepep_gcp
-- Example training container template for DGX-B200: https://github.com/yanring/Megatron-MoE-ModelZoo/blob/main/dockers/B200.Dockerfile 
-
-**Dockerfile**
-```bash
-FROM nvcr.io/nvidia/pytorch:25.11-py3 as base
-
-# Other dependencies you may want
-...
-
-# Dependency of IBGDA
-RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
-
-# Clone DeepEP customized version 
-WORKDIR /home/dpsk_a2a
-RUN git clone https://github.com/zhongbozhu/DeepEP.git ./deepep
-RUN cd ./deepep && git checkout nvshmem_deepep_gcp && cd /home/dpsk_a2a
-
-# Clone NVSHMEM 3.5 https://github.com/NVIDIA/nvshmem
-RUN git clone --branch v3.5.19-1 https://github.com/NVIDIA/nvshmem.git ./deepep-nvshmem
-RUN cd ./deepep-nvshmem && git checkout v3.5.19-1 && cd /home/dpsk_a2a
-
-# Build nvshmem from source
-# You can also download the pre-built binary, and skip the following 
-RUN apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        clang \
-        llvm-dev \
-        libclang-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /home/dpsk_a2a/deepep-nvshmem
-RUN mkdir -p build && mkdir -p install && \
-    cmake -S . -B build \
-    -DCMAKE_INSTALL_PREFIX=/home/dpsk_a2a/deepep-nvshmem/install \
-    -DCUDA_HOME=/usr/local/cuda \
-    -DMPI_HOME=/opt/hpcx/ompi \
-    -DMPI_C_COMPILER=/opt/hpcx/ompi/bin/mpicc \
-    -DMPI_CXX_COMPILER=/opt/hpcx/ompi/bin/mpicxx \
-    -DNVSHMEM_MPI_SUPPORT=OFF \
-    -DNVSHMEM_IBRC_SUPPORT=ON \
-    -DNVSHMEM_IBGDA_SUPPORT=ON \
-    -DNVSHMEM_IBDEVX_SUPPORT=OFF \
-    -DNVSHMEM_UCX_SUPPORT=OFF \
-    -DNVSHMEM_SHMEM_SUPPORT=OFF \
-    -DNVSHMEM_PMIX_SUPPORT=OFF \
-    -DNVSHMEM_USE_NCCL=OFF \
-    -DNVSHMEM_USE_GDRCOPY=ON \
-    -DGDRCOPY_HOME=/usr \
-    -DNVSHMEM_USE_MLX5DV=ON \
-    -DNVSHMEM_BUILD_TESTS=ON \
-    -DNVSHMEM_BUILD_EXAMPLES=ON \
-    -DNVSHMEM_BUILD_PYTHON_LIB=OFF \
-    -DNVSHMEM_BUILD_BITCODE_LIBRARY=OFF \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_CUDA_ARCHITECTURES="100" && \
-    cmake --build build -j && \
-    cmake --install build
-
-ENV NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install
-ENV LD_LIBRARY_PATH=${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH
-ENV PATH=${NVSHMEM_DIR}/bin:$PATH
-
-## Build deepep
-WORKDIR /home/dpsk_a2a/deepep
-ENV TORCH_CUDA_ARCH_LIST="10.0"
-ENV PIP_NO_BUILD_ISOLATION=1
-ENV CPATH=${CUDA_HOME}/include/cccl:$CPATH
-RUN pip install --no-build-isolation .
-
-```
-
-DeepEP provides `test_internode.py` to test and benchmark cross-node EP communication. In our experiment, when using 4 nodes of DGX-B200 (i.e., EP32), the achieved throughput for cross-EP is about 50 GB/s with IBRC. We provide an example SLURM script below for running such a test with DeepEP.
-
-In another experiment on the same cluster, with IBGDA enabled by the cluster admin, we observed approximately 10% higher inter-node performance—roughly 55 GB/s. To enable IBGDA, you need to set the environment variable `export NVSHMEM_IB_ENABLE_IBGDA=true`; there is no need to change the software version or container, because with the software provided above, both modes will work.
-
-```bash
-srun --account=<your_account> -N 4 -p batch --time 30 \
-     --ntasks-per-node=1 --gpus-per-node=8 \
-     --no-container-mount-home --container-mounts "/lustre:/lustre" \
-     --container-image <your_container_path> \
-     --mpi=none --export=ALL \
-     bash -lc '
-set -eo pipefail 
-
-# Env Var for GPU-NIC mapping
-export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
-export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1"
-
-
-# 1) Expand SLURM_JOB_NODELIST and grab the first hostname
-headnode=$(python - <<PY
-import os, re
-nl = os.environ.get("SLURM_JOB_NODELIST", "") or os.environ.get("SLURM_NODELIST", "")
-if not nl:
-    print(""); raise SystemExit(0)
-m = re.match(r"^([^-\\[]+)-(\\[(.+)\\]|(\\d+))$", nl)
-if not m:
-    # no bracket/range, just print it as-is
-    print(nl); raise SystemExit(0)
-prefix = m.group(1)
-br_or_num = m.group(3) or m.group(4)
-candidates = []
-for part in br_or_num.split(","):
-    part = part.strip()
-    if "-" in part:
-        a,b = part.split("-",1)
-        # preserve zero padding
-        width = max(len(a), len(b))
-        start, end = int(a), int(b)
-        candidates.append(f"{prefix}-{start:0{width}d}")
-    else:
-        candidates.append(f"{prefix}-{part}")
-print(sorted(candidates)[0])
-PY
-)
-
-if [[ -z "$headnode" ]]; then
-  echo "Could not determine master host from SLURM_JOB_NODELIST"; exit 1
-fi
-
-# 2) Resolve to an IP that both nodes can reach (fallback to the hostname)
-if command -v getent >/dev/null 2>&1; then
-  master_ip=$(getent ahostsv4 "$headnode" | awk "{print \$1; exit}")
-else
-  master_ip=""
-fi
-MASTER_ADDR="${master_ip:-$headnode}"
-
-# 3) Export rendezvous env that matches test_internode.py expectations
-export MASTER_ADDR
-export MASTER_PORT=${MASTER_PORT:-29500}
-export WORLD_SIZE=${SLURM_NNODES:-2}   # number of nodes
-export RANK=${SLURM_NODEID:-0}         # 0..N-1 per node
-
-export OMP_NUM_THREADS=1
-python -u /home/dpsk_a2a/deepep/tests/test_internode.py
-'
-
-```
-
-
-
-
-
-
-
-
-
-
-## Index - List of Tuning Knobs
-
-- `CommOverlapConfig.tp_comm_overlap`
-- `CommOverlapConfig.tp_comm_overlap_cfg`
-- `CUDA_DEVICE_MAX_CONNECTIONS`
-- `TrainingConfig.manual_gc_interval`
-- `MixedPrecisionConfig.fp8_param`
-- `ProfilingConfig`
-- `NCCL_NET_GDR_C2C`
-- `NCCL_NET_GDR_LEVEL`
-- `NCCL_NVLS_ENABLE`
-- `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives>`
-- `TransformerConfig.attention_backend`
-- `AttnBackend`
-- `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives>`
-- `PYTORCH_CUDA_ALLOC_CONF`
-- `TrainingConfig.micro_batch_size`
-- `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size`
-- `TransformerConfig.apply_rope_fusion`
-- `TransformerConfig.bias_activation_fusion`
-- `TransformerConfig.bias_dropout_fusion`
-- `TransformerConfig.cp_comm_type`
-- `TransformerConfig.cpu_offloading`
-- `TransformerConfig.cpu_offloading_num_layers`
-- `TransformerConfig.cpu_offloading_weights`
-- `GPTProvider.cross_entropy_loss_fusion`
-- `TransformerConfig.cuda_graph_impl` / `cuda_graph_scope` (see [CUDA Graphs](training/cuda-graphs.md))
-- `MixedPrecisionConfig.fp8_param_gather`
-- `GPTProvider.gradient_accumulation_fusion`
-- `TransformerConfig.masked_softmax_fusion`
-- `TransformerConfig.recompute_granularity`
-- `TransformerConfig.recompute_method`
-- `TransformerConfig.recompute_num_layers`
-- `OptimizerConfig.use_precision_aware_optimizer`
-- `GPTProvider.account_for_embedding_in_pipeline_split`
-- `GPTProvider.account_for_loss_in_pipeline_split`
-- `TransformerConfig.context_parallel_size`
-- `DistributedDataParallelConfig.align_param_gather`
-- `DistributedDataParallelConfig.bucket_size`
-- `DistributedDataParallelConfig.bucket_size`
-- `DistributedDataParallelConfig.data_parallel_sharding_strategy`
-- `DistributedDataParallelConfig.grad_reduce_in_fp32`
-- `DistributedDataParallelConfig.num_distributed_optimizer_instances`
-- `DistributedDataParallelConfig.overlap_grad_reduce`
-- `DistributedDataParallelConfig.overlap_param_gather`
-- `T5ModelProvider.encoder_pipeline_model_parallel_size`
-- `T5ModelProvider.encoder_tensor_model_parallel_size`
-- `TransformerConfig.expert_model_parallel_size=<int>`
-- `TransformerConfig.expert_tensor_parallel_size=<int>`
-- `TransformerConfig.moe_grouped_gemm`
-- `DistributedInitConfig.use_torch_fsdp2`
-- `TransformerConfig.pipeline_model_parallel_size`
-- `TransformerConfig.tensor_model_parallel_size`
-- `TransformerConfig.virtual_pipeline_model_parallel_size`
-- `OptimizerConfig.use_distributed_optimizer`
-- `TORCH_NCCL_AVOID_RECORD_STREAMS`
-- `TPOverlapCfg.cga_size`
-- `TPOverlapCfg.fp8_buf`
-- `TPOverlapCfg.num_sm`
-- `TPOverlapCfg.num_split`
-<!-- - `garbageCollectionCallback.gc_interval_val` -->
-<!-- - `NsysPlugin` -->
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/communication-overlap.md
-```md
-# Communication Overlap
-
-Communication overlap reduces exposed communication cost in distributed training
-by hiding collectives or point-to-point transfers under useful compute.
-
-This page is the stable guide for what communication overlap is, when it tends
-to help, and which boundaries are durable across Megatron Bridge. For exact
-knobs, code anchors, and verification commands, see:
-
-- `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md`
-- `skills/perf-techniques/expert-parallel-overlap/SKILL.md`
-
-## What It Is
-
-In Bridge, communication overlap is a family of related techniques rather than a
-single switch:
-
-| Mode | What gets hidden | Main gate |
-|---|---|---|
-| DP | gradient reduce-scatter and parameter all-gather | distributed-optimizer overlap path |
-| TP | tensor-parallel collectives under layer compute | `CommOverlapConfig.tp_comm_overlap` plus sequence parallelism |
-| PP | pipeline send/recv work under schedule execution | pipeline schedule and virtual pipeline layout |
-| CP | context-parallel communication inside CP execution paths | CP implementation choice |
-| EP | MoE token dispatch/combine communication under expert compute | `overlap_moe_expert_parallel_comm` |
-
-These paths share the same goal, but they do not share the same enablement
-rules, evidence level, or failure modes.
-
-## What Problem It Solves
-
-Distributed training often becomes communication-bound before it becomes
-compute-bound. Once TP, DP, PP, CP, or EP traffic is visible on the critical
-path, adding more GPUs may raise communication time faster than it raises useful
-compute.
-
-Communication overlap addresses that by moving communication earlier or later in
-the step so the same transfer can happen while some other part of the model is
-already doing useful work. It does not change the training objective. It tries
-to reduce idle time.
-
-## Impacted Training Dimensions
-
-| Dimension | Effect | Confidence | Why |
-|---|---|---|---|
-| `speed` | ~0-15% faster step time, mode-dependent | medium | The whole point is to hide communication time, but gain depends strongly on which overlap mode is active and whether communication is actually exposed. EP overlap measured flat to ~13% slower on small-EP Qwen3-30B-A3B, so gains are not guaranteed. |
-| `memory` | neutral (some modes add ~1-2 GB for buffers) | low | Overlap itself is usually not a primary memory technique, although some implementations (e.g., TP userbuffers) add buffer or scheduling constraints. |
-| `scale` | positive at higher parallelism degrees | medium | Overlap becomes more valuable as communication dominates larger distributed runs. |
-| `convergence` | no change expected | medium | The intent is to preserve the same training math, though schedule changes can alter floating-point accumulation order. |
-| `stability` | adds operational constraints | medium | More overlap usually means tighter requirements around schedule shape, precision, runtime versions, and feature combinations. |
-
-## When to Use It
-
-Enable communication overlap when all of the following are mostly true:
-
-- the distributed configuration already works correctly without overlap
-- communication is a meaningful part of step time
-- you are tuning throughput or utilization, not doing first bring-up
-- you can benchmark the specific overlap mode you plan to use
-
-As a rule of thumb:
-
-| Mode | Good first use case | Recommendation |
-|---|---|---|
-| DP | distributed optimizer on multi-GPU or multi-node training | Usually worth considering early once optimizer sharding is already chosen. |
-| TP | `TP >= 2` with sequence parallelism and TE-enabled path | Benchmark when TP collectives are visible in the profile. |
-| PP | interleaved pipeline schedules where p2p overhead is visible | Treat as schedule tuning, not a blanket PP default. |
-| CP | large-context runs already using CP | Follow the CP-specific guidance rather than treating it as a separate generic knob. |
-| EP | large-scale MoE with many micro-batches and inter-node A2A cost | Most promising at larger EP and with higher-latency dispatcher backends. |
-
-Measured repo evidence today is strongest for MoE EP overlap. On
-Qwen3-30B-A3B with EP=4 and `alltoall` on 2 H100 nodes, EP overlap is
-numerically safe at GBS=8 but provides no speedup, and it is about 13% slower
-at GBS=64. On Qwen3-Next-80B-A3B with EP=8 and `alltoall` on 8 nodes, the
-overlap variants are stable while the non-overlap baseline NaNs, but
-`delay_wgrad_compute` is still about 4.8% slower than overlap-only. That makes
-EP overlap correctness-backed in this repo, but not yet broadly speedup-backed.
-
-## When Not to Use It
-
-Avoid communication overlap when any of these are true:
-
-- you are still debugging a new distributed setup
-- the profile is compute-bound rather than communication-bound
-- the required companion feature is missing, such as sequence parallelism for TP
-- another feature already imposes conflicting runtime constraints
-- you have not benchmarked the exact model and parallelism shape
-
-For MoE EP overlap specifically, avoid treating it as a default when:
-
-- `EP <= 4` with `alltoall` on `<= 2` nodes
-- the run has very few pipeline micro-batches
-- `moe_shared_expert_overlap` must stay enabled
-- the run requires full recompute or a recompute schedule that is incompatible with EP overlap
-
-## Feature Interactions
-
-The most important interactions are:
-
-- DP overlap is tied to distributed-optimizer behavior rather than a fully independent tuning path.
-- TP overlap depends on sequence parallelism and the supported TE overlap path.
-- PP and EP overlap interact with virtual pipeline layout when `PP > 1`.
-- CP overlap should be reasoned about together with the chosen CP communication type.
-- EP overlap with DeepEP or HybridEP requires explicitly switching the dispatcher to `flex`.
-- EP overlap and `moe_shared_expert_overlap` are mutually exclusive.
-- CUDA graphs plus `delay_wgrad_compute` adds extra TE-version and graph-scope restrictions.
-- Launch-time environment tuning can conflict across overlap paths, especially TP or CP overlap versus DeepEP or HybridEP tuning.
-
-## Bridge Configuration
-
-Communication overlap is configured through `CommOverlapConfig` plus
-mode-specific model settings. There is no single universal toggle — DP, TP,
-PP, CP, and EP each have different prerequisites and should be enabled based
-on the actual bottleneck.
-
-For config examples and minimal runnable commands, see:
-
-- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md)
-- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md)
-
-## Expected Metric Changes
-
-| Metric | Expected Change | Conditions | Evidence |
-|---|---|---|---|
-| `step_time` | down | DP overlap with distributed optimizer on communication-heavy runs | expected |
-| `step_time` | down | TP overlap with `TP >= 2`, sequence parallelism, and supported TE path | expected |
-| `pipeline_idle_time` | down | interleaved PP where p2p cost is visible | expected |
-| `step_time` | flat | Qwen3-30B-A3B, EP=4, `alltoall`, 2 nodes, GBS=8 | measured: 822ms baseline vs 827ms overlap |
-| `step_time` | up | same model/config, GBS=64 | measured: 4889ms baseline vs 5538ms overlap |
-| `step_time` | up | Qwen3-Next-80B-A3B, EP=8, `alltoall`, 8 nodes, `delay_wgrad_compute=True` vs overlap-only | measured: 4912ms vs 4686ms |
-
-Do not assume one overlap win transfers automatically to another mode. The
-correct question is always "which communication path is exposed in this run?"
-
-## Common Failure Modes
-
-- TP overlap silently disables itself when sequence parallelism is off or `TP < 2`.
-- PP overlap expectations are wrong when the schedule is non-interleaved or VPP is missing.
-- EP overlap asserts when `PP > 1` but `virtual_pipeline_model_parallel_size` is unset.
-- EP overlap asserts when full recompute, recompute method, or shared-expert overlap stays enabled.
-- Setting `moe_flex_dispatcher_backend` alone does not activate DeepEP or HybridEP; the dispatcher must actually switch to `flex`.
-- Small-EP `alltoall` MoE runs can get slower because scheduling overhead is larger than the communication being hidden.
-
-## Related Docs
-
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/cuda-graphs.md](cuda-graphs.md)
-- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md)
-- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md)
-- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md)
-- [skills/perf-techniques/moe-comm-overlap/SKILL.md](../skills/perf-techniques/moe-comm-overlap/SKILL.md)
-- [skills/perf-techniques/moe-comm-overlap/card.yaml](../skills/perf-techniques/moe-comm-overlap/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/mixed-precision.md
-```md
-# Mixed Precision Training
-
-Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. Megatron Bridge supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models through the {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` configuration.
-
-## Configuration Overview
-
-Mixed precision is configured in Megatron Bridge through the `mixed_precision` field in {py:class}`bridge.training.config.ConfigContainer`, which accepts either:
-- A string name referencing a predefined recipe (e.g., `"bf16_mixed"`)  
-- A {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` object for custom configurations
-
-The mixed precision configuration automatically updates the model, optimizer, and distributed data parallel settings with the appropriate precision parameters.
-
-## Half-Precision Training
-
-Megatron Bridge supports half-precision (FP16 and BF16) training via Megatron Core and the distributed optimizer. This training recipe uses half precision for all layer computation while keeping the model states (optimizer states and master parameters) in single precision. To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of the half-precision parameters that is updated after each optimizer step.
-
-### Using Predefined Recipes
-
-The simplest way to enable mixed precision is using predefined recipe names:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure with BF16 mixed precision
-config = ConfigContainer(
-    mixed_precision="bf16_mixed",
-    # ... other config parameters
-)
-
-# Configure with FP16 mixed precision  
-config = ConfigContainer(
-    mixed_precision="fp16_mixed",
-    # ... other config parameters
-)
-```
-
-### Custom Mixed Precision Configuration
-
-For more control, create a custom {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig`:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
-import torch
-
-# Custom BF16 configuration
-bf16_config = MixedPrecisionConfig(
-    bf16=True,
-    params_dtype=torch.bfloat16,
-    pipeline_dtype=torch.bfloat16,
-    autocast_enabled=False,
-    grad_reduce_in_fp32=True,
-)
-
-config = ConfigContainer(
-    mixed_precision=bf16_config,
-    # ... other config parameters
-)
-```
-
-## FP8 Training
-
-The NVIDIA H100 GPU introduced support for a new data type, FP8 (8-bit floating point), enabling higher throughput for matrix multiplies and convolutions. Megatron Bridge uses NVIDIA Transformer Engine (TE) to leverage speedups from FP8. For a more detailed overview, refer to the [TE documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html), specifically the sections on FP8 formats and recipes.
-
-### FP8 Configuration Parameters
-
-The {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` provides several FP8-specific parameters:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `fp8` | `Optional[str]` | `None` | FP8 format: `"hybrid"` (E4M3 for activations/weights, E5M2 for gradients) or `"e4m3"` |
-| `fp8_recipe` | `str` | `"tensorwise"` | FP8 recipe type: `"tensorwise"`, `"delayed"`, `"blockwise"`, `"mxfp8"` (Blackwell only) |
-| `first_last_layers_bf16` | `bool` | `False` | If True, retains first and last N TransformerBlocks in BF16 as opposed to FP8 |
-| `num_layers_at_start_in_bf16` | `int` | `0` | Number of layers at the start of the model to keep in BF16 precision when `first_last_layers_bf16` is True |
-| `num_layers_at_end_in_bf16` | `int` | `0` | Number of layers at the end of the model to keep in BF16 precision when `first_last_layers_bf16` is True |
-| `fp8_margin` | `int` | `0` | Scaling factor shift by $2^{margin}$ |
-| `fp8_amax_history_len` | `int` | `1` | Window size for amax history storage |
-| `fp8_amax_compute_algo` | `str` | `"most_recent"` | Amax selection algorithm: `"max"` or `"most_recent"` |
-| `fp8_param` | `Optional[bool]` | `None` | Store module-level parameters in FP8 |
-| `fp8_param_gather` | `bool` | `False` | Enable FP8 parameter gathering |
-
-### FP8 Recipe Examples
-
-Use any of the predefined FP8 recipe names with the `mixed_precision` parameter:
-
-```python
-# Example: BF16 with FP8 current scaling
-config = ConfigContainer(
-    mixed_precision="bf16_with_fp8_current_scaling_mixed",
-    # ... other config parameters
-)
-```
-
-## Available Mixed Precision Recipes
-
-Megatron Bridge provides numerous predefined mixed precision recipes for different use cases. You can use the {py:func}`~megatron.bridge.training.mixed_precision.get_mixed_precision_config` utility function to convert from a string shortname to a class instance. For the complete list of available recipes and their specific configurations, see the {py:mod}`megatron.bridge.training.mixed_precision` module.
-
-
-### Custom FP8 Configuration
-
-For advanced use cases, create a custom FP8 configuration:
-
-```python
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
-import torch
-
-# Custom FP8 configuration
-fp8_config = MixedPrecisionConfig(
-    bf16=True,
-    params_dtype=torch.bfloat16,
-    pipeline_dtype=torch.bfloat16,
-    fp8="hybrid",
-    fp8_recipe="tensorwise", 
-    fp8_margin=0,
-    fp8_amax_history_len=1024,
-    fp8_amax_compute_algo="max",
-    fp8_param_gather=True,
-)
-
-config = ConfigContainer(
-    mixed_precision=fp8_config,
-    # ... other config parameters
-)
-```
-
-### Registering Custom Mixed Precision Recipes
-
-You can also register your own custom mixed precision configurations to work with the shortname system. Use the {py:func}`~megatron.bridge.training.mixed_precision.register` decorator on a function that returns a `MixedPrecisionConfig` object:
-
-```python
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config, register
-
-@register
-def my_custom_fp8_recipe() -> MixedPrecisionConfig:
-    """Custom FP8 recipe with specific settings for my use case."""
-    return MixedPrecisionConfig(
-        bf16=True,
-        fp8="hybrid",
-        fp8_recipe="tensorwise",
-        fp8_param_gather=True,
-        # ... other custom settings
-    )
-
-# Now you can use it with the utility function
-config = get_mixed_precision_config("my_custom_fp8_recipe")
-```
-
-Common recipe categories include:
-- **Half-precision recipes**: Basic BF16 and FP16 mixed precision
-- **FP8 recipes**: Various FP8 scaling strategies (delayed, current, subchannel)
-- **Architecture-specific recipes**: Optimized for specific GPU architectures (Hopper, Blackwell)
-- **Model-specific recipes**: Tuned for particular model families
-
-## Configuration Synchronization
-
-When a mixed precision configuration is provided, it automatically synchronizes precision-related settings across the model, optimizer, and distributed data parallel (DDP) configurations. This ensures consistent precision behavior throughout the training pipeline.
-
-**Important**: Mixed precision settings will override any conflicting precision parameters that may have been set directly on the model, optimizer, or DDP configurations. The mixed precision configuration acts as the authoritative source for all precision-related parameters.
-
-For example, if you specify both:
-```python
-# This will be overridden
-model_config.bf16 = False
-optimizer_config.bf16 = False
-
-config = ConfigContainer(
-    model=model_config,
-    optimizer=optimizer_config,
-    mixed_precision="bf16_mixed",  # This takes precedence during training
-    # ... other configs
-)
-```
-
-The mixed precision configuration will set `bf16=True` on both the model and optimizer configs, overriding the explicitly set `False` values. This synchronization prevents configuration mismatches that could lead to training issues.
-
-## Performance Considerations
-
-- **FP8 recipes are experimental** and convergence has not been fully validated for all models
-- **BF16** is generally recommended over FP16 for better numerical stability
-- **FP8** provides the best performance on H100 GPUs but requires careful tuning
-- **MXFP8** recipes are only supported on Blackwell architecture GPUs
-- **Blockwise scaling** recipes are optimized for Hopper architecture GPUs
-
-## Resources
-
-- [Transformer Engine Documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html)
-- [Intro to FP8, floating point formats, and mixed precision training](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Introduction-to-FP8)
-- [Performance optimizations](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/advanced_optimizations.html) that are natively supported in Megatron Bridge by enabling FP8 training with TE
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/activation-recomputation.md
-```md
-# Activation Recomputation
-
-The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. When training an LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage.
-
-Activation recomputation in Megatron Bridge is configured through the model provider's recomputation parameters, which are based on Megatron Core's `TransformerConfig`.
-
-## Transformer Layer Recomputation
-
-Megatron Bridge supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer's forward computation.
-
-Megatron Bridge also supports partial transformer layer recomputation, which is beneficial when recomputing a few transformer layers helps to reduce enough GPU memory for the model to fit. This approach avoids the need to recompute the rest of the layers.
-
-### Configuration
-
-Transformer layer recomputation is configured through the model provider's recomputation parameters:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Full recomputation - recompute all layers
-model_config = GPTModelProvider(
-    recompute_granularity="full",  # Enable full layer recomputation
-    recompute_method="uniform",    # Uniform distribution across layers
-    recompute_num_layers=4,        # Number of layers per recomputation block
-    # ... other model parameters
-)
-```
-
-### Recomputation Methods
-
-#### Block Method
-Recomputes a specific number of transformer layers per pipeline stage:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="full",
-    recompute_method="block",      # Block-wise recomputation
-    recompute_num_layers=4,        # Recompute 4 layers per pipeline stage
-)
-```
-
-#### Uniform Method
-Uniformly divides the total number of transformer layers and recomputes input activations for each divided chunk:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="full",
-    recompute_method="uniform",    # Uniform distribution
-    recompute_num_layers=8,        # Number of layers per recomputation block
-)
-```
-
-### Pipeline Parallelism Considerations
-
-When training with pipeline parallelism:
-- `recompute_num_layers` indicates the layers per pipeline stage
-- When using virtual pipelining, `recompute_num_layers` specifies the number of layers per virtual pipeline stage
-- The framework automatically handles recomputation coordination across pipeline stages
-
-![Activation Recomputation Methods](images/activation-recomputation-example-1.jpg)
-*Figure 1: Scheme of uniform and block checkpointing method (full checkpointing granularity)*
-
-## Self-attention Recomputation
-
-Megatron Bridge supports selective self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations. This cost-efficient method achieves high memory savings with minimal recomputation cost.
-
-The intermediate layers of the self-attention block account for the majority of the activation memory because the input sizes of softmax, dropout, and QKV dot-product attention layers have memory complexity proportional to the sequence length squared. However, their recomputation cost is relatively small compared with that of the linear projection layers, whose cost scales with the hidden size squared.
-
-![Activation Recomputation Granularity](images/activation-recomputation-example-2.jpg)
-*Figure 2: Scheme of full and selective checkpointing granularity*
-
-### Configuration
-
-Self-attention recomputation is enabled using selective granularity:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-model_config = GPTModelProvider(
-    recompute_granularity="selective",  # Enable selective recomputation
-    recompute_modules=["core_attn"],    # Recompute attention modules (default)
-    # ... other model parameters
-)
-```
-
-### Recomputation Modules
-
-Megatron Bridge supports selective recomputation for various modules:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="selective",
-    recompute_modules=[
-        "core_attn",      # Core attention computation (default)
-        "mlp",            # MLP layers
-        "layernorm",      # Layer normalization
-        "moe",            # Mixture of Experts layers
-        "moe_act",        # MoE activation functions
-        "shared_experts", # Shared expert layers
-        "mla_up_proj",    # Multi-Latent Attention up projection
-    ],
-)
-```
-
-### Flash Attention Integration
-
-Self-attention recomputation is automatically enabled when using Flash Attention through Transformer Engine. Flash Attention inherently provides memory efficiency by recomputing attention scores rather than storing them, making additional explicit recomputation often unnecessary.
-
-## Advanced Recomputation Configuration
-
-### Distributed Activation Checkpointing
-
-For models using model parallelism, you can distribute saved activations across the model parallel group:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="selective",
-    distribute_saved_activations=True,  # Distribute across model parallel group
-    # Note: Cannot be used with sequence_parallel=True
-)
-```
-
-### Memory vs Computation Trade-offs
-
-Different recomputation strategies offer different memory-computation trade-offs:
-
-- **Selective recomputation**: Provides high memory savings with minimal recomputation cost by targeting memory-intensive operations like attention
-- **Full recomputation**: Significantly reduces activation memory usage but increases per-transformer layer computation cost by approximately 30%
-- **No recomputation**: Preserves all activations in memory, requiring more GPU memory but no additional computation
-
-### MoE-Specific Recomputation
-
-For Mixture of Experts models, specialized recomputation options are available:
-
-```python
-model_config = GPTModelProvider(
-    # MoE configuration
-    num_moe_experts=8,
-    expert_model_parallel_size=2,
-    
-    # MoE recomputation
-    recompute_granularity="selective",
-    recompute_modules=["moe", "moe_act"],  # Recompute MoE-specific modules
-)
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/megatron-fsdp.md
-```md
-# Megatron FSDP
-
-Megatron FSDP is the practical fully sharded data parallel path in Megatron
-Bridge today. It shards parameters, gradients, and optimizer state across data
-parallel ranks, which can reduce model-state memory substantially compared with
-plain Distributed Data Parallel (DDP) or the distributed optimizer path.
-
-This page is the stable overview for what Megatron FSDP is, when to use it, and
-what constraints matter. For operational enablement, code anchors, and
-verification commands, see [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md).
-
-## What It Is
-
-Megatron FSDP is the Megatron-Core custom FSDP implementation exposed in Bridge
-through `use_megatron_fsdp`.
-
-Compared with other data-parallel strategies:
-
-| Feature | DDP | Distributed Optimizer | Megatron FSDP |
-|---|---|---|---|
-| Parameter Storage | Replicated | Replicated | Sharded |
-| Optimizer States | Replicated | Sharded | Sharded |
-| Gradient Communication | All-reduce | Reduce-scatter | Reduce-scatter |
-| Parameter Communication | None | All-gather (after update) | All-gather (on-demand) |
-| Memory Efficiency | Baseline | High | Highest |
-| Communication Overhead | Low | Medium | Medium-High |
-
-The practical consequence is that Megatron FSDP is most useful when model-state
-memory, rather than activation memory, is the main bottleneck.
-
-## When to Use It
-
-Megatron FSDP is a good fit when all of the following are true:
-
-- the model is too large for plain DDP or distributed optimizer
-- you want the strongest currently supported FSDP path in Bridge
-- you are willing to trade more communication for lower memory
-- you can adopt the required FSDP checkpoint format
-
-Prefer another path when:
-
-- DDP already fits comfortably and simplicity matters most
-- distributed optimizer gives enough memory relief without fully sharding
-- you are evaluating PyTorch FSDP2 for production use on this branch
-
-## Stable Requirements
-
-Megatron FSDP in Bridge requires:
-
-- `use_megatron_fsdp` to be enabled
-- checkpoint format `fsdp_dtensor`
-- standard rank initialization order
-
-The `fsdp_dtensor` format uses PyTorch DTensor and
-`torch.distributed.checkpoint` (DCP) to store sharded parameters and optimizer
-state. It is **not interchangeable** with `torch_dist` or `zarr` checkpoints —
-you cannot load an `fsdp_dtensor` checkpoint into a non-FSDP run or vice versa.
-
-`fsdp_dtensor` is compatible with 5D parallelism (TP + PP + DP + CP + EP).
-Because DCP stores DTensor placement metadata, checkpoints saved under one
-parallelism layout can be loaded under a different layout (e.g., change TP or PP
-size between runs) — DCP handles the shard remapping automatically. The one
-unsupported combination is `use_tp_pp_dp_mapping=True`, which uses an
-alternative rank-initialization order that conflicts with FSDP sharding.
-
-Important stable constraints:
-
-- `use_megatron_fsdp` and `use_torch_fsdp2` are mutually exclusive
-- `use_tp_pp_dp_mapping` is not supported with Megatron FSDP
-- legacy checkpoint formats such as `torch_dist` and `zarr` are not valid for
-  Megatron FSDP save/load
-
-When Megatron FSDP is enabled, Bridge also adjusts some settings
-automatically, including disabling `average_in_collective` and several
-buffer-reuse optimizations that do not match the FSDP path.
-
-## Compatibility and Caveats
-
-At the configuration level, Megatron FSDP is intended to work with:
-
-- tensor parallelism
-- pipeline parallelism
-- context parallelism
-- expert parallelism
-- BF16 or FP16 mixed precision
-
-However, not every combination has the same level of in-repo validation or
-performance evidence. Treat broad compatibility as code-supported first, not as
-fully benchmark-proven for every combination.
-
-Two practical caveats matter most:
-
-1. Public recipes may expose `use_megatron_fsdp` while still defaulting to a
-   non-FSDP checkpoint format. The checkpoint requirement is stable and
-   mandatory even when recipe ergonomics lag behind.
-2. FSDP reduces model-state memory, not activation memory. For long-sequence or
-   activation-bound workloads, other techniques such as context parallelism,
-   activation recomputation, or CPU offloading may still be needed.
-
-## Torch FSDP2 Status
-
-Megatron Bridge also exposes a PyTorch FSDP2 path via `use_torch_fsdp2`, but
-that path should still be treated as experimental on this branch.
-
-The stable recommendation today is:
-
-- use Megatron FSDP if you need an FSDP path in Bridge
-- do not treat FSDP2 as interchangeable with Megatron FSDP
-
-## Related Docs
-
-- [docs/training/checkpointing.md](checkpointing.md)
-- [docs/training/cpu-offloading.md](cpu-offloading.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md)
-- [skills/perf-techniques/megatron-fsdp/card.yaml](../skills/perf-techniques/megatron-fsdp/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/hybrid-context-parallel.md
-```md
-# Hybrid / Hierarchical Context Parallel
-
-This page covers the stable Bridge-facing meaning of hierarchical context
-parallelism, especially the `a2a+p2p` transport path and
-`hierarchical_context_parallel_sizes`.
-
-For operational setup, code anchors, and verification commands, see
-[skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md).
-
-## What It Is
-
-Context parallelism (CP) splits the input sequence across GPUs so each rank
-processes a chunk. The GPUs must communicate KV data during attention. There are
-several CP communication backends:
-
-| `cp_comm_type` | Mechanism | Async / Overlap | Constraint |
-|---|---|---|---|
-| `"p2p"` | Ring-exchange of KV chunks | Yes | None |
-| `"all_gather"` | All-gather full KV before attention | No | None |
-| `"a2a"` | All-to-all: scatter heads, gather full sequence (Ulysses-style) | N/A | **CP <= num_kv_heads** |
-| `"a2a+p2p"` | Hierarchical: a2a within inner group, p2p across outer group | Partial (p2p part) | Requires `hierarchical_context_parallel_sizes` |
-
-**HCP (`a2a+p2p`)** exists to scale CP beyond the KV head count by combining
-a2a (fast, head-parallel) on intra-node links with p2p (async,
-sequence-parallel) on inter-node links.
-
-It is important to separate this from the upstream boolean
-`hybrid_context_parallel`, which is a different feature for balancing packed or
-variable-length workloads. The two concepts should not be treated as
-interchangeable.
-
-### Why a2a is limited by KV heads
-
-a2a transposes the parallelism dimension: each rank trades its sequence chunk
-for a subset of attention heads. After the all-to-all, every rank has the
-**full sequence** but only `heads / CP` heads. This means:
-
-- `heads / CP` must be a positive integer.
-- The bottleneck is KV heads (not Q heads), because in GQA the KV heads are the
-  indivisible unit.
-- If the model has 8 KV heads, pure a2a supports at most CP=8.
-
-HCP breaks this limit by applying a2a only within a sub-group small enough to
-fit within the KV head count.
-
-## When to Use It
-
-**Use HCP when ALL of these are true:**
-
-1. You need CP larger than `num_kv_heads / TP` (pure a2a won't fit).
-2. You cannot (or don't want to) increase TP to shrink CP.
-3. Your cluster has a clear bandwidth hierarchy (e.g., NVLink intra-node >> IB
-   inter-node).
-
-**Prefer pure `a2a` when:**
-
-- You can adjust TP so that `CP <= num_kv_heads / TP`. This is simpler, avoids
-  the p2p overhead, and often yields the same throughput with better memory
-  headroom.
-
-**Prefer pure `p2p` when:**
-
-- You have very few KV heads or want maximum CP flexibility.
-- Your workload can hide the p2p latency behind compute (long sequences help).
-
-### Decision example
-
-Model: 8 KV heads. Cluster: 4 nodes x 8 GPUs. Goal: train 128K sequences.
-
-| Option | TP | CP | `cp_comm_type` | Notes |
-|---|---|---|---|---|
-| A | 1 | 16 | `a2a+p2p` with `[8,2]` | a2a intra-node (8 GPUs), p2p across 2 node-groups |
-| B | 2 | 4 | `a2a` | CP=4 <= 8 KV heads / TP=2. Simpler. Often same throughput. |
-| C | 1 | 16 | `p2p` | Works but no a2a bandwidth benefit intra-node |
-
-In practice, **option B is usually preferred** -- benchmarks showed identical
-throughput to option A with more memory headroom.
-
-HCP (`a2a+p2p`) should be treated as an advanced feature rather than a default recommendation.
-
-## Stable Bridge Limitation
-
-The most important Bridge-specific limitation is that hierarchical context
-parallelism is currently supported only on the MPU initialization path.
-
-In practice, that means:
-
-- `dist.use_decentralized_pg=False` is the supported Bridge path
-- the decentralized process-group path should not be assumed to materialize HCP
-  groups
-
-## Stable Constraints
-
-The durable constraints are:
-
-- the product of the entries in `hierarchical_context_parallel_sizes` must
-  equal `context_parallel_size` (for example, `[8, 2]` for CP=16)
-- the usual CP sequence-length divisibility rules still apply
-- Transformer Engine version support matters for `a2a+p2p`
-
-## Recommendation Level
-
-Use hierarchical context parallelism in Bridge only when you intentionally want
-that transport path and are prepared to validate execution-path details. It is
-not yet the kind of feature that should be presented as universally safe across
-all Bridge initialization modes.
-
-## Related Docs
-
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/communication-overlap.md](communication-overlap.md)
-- [skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md)
-- [skills/perf-techniques/hybrid-context-parallel/card.yaml](../skills/perf-techniques/hybrid-context-parallel/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md
-```md
-# Packed Sequences
-
-Packed sequences are a fine-tuning technique that reduces padding waste by
-concatenating multiple examples into one pack while preserving sequence
-boundaries for attention. In Megatron Bridge, this is primarily a supervised
-fine-tuning and PEFT optimization rather than a general pretraining feature.
-
-This page is the stable overview for what packed sequences are, when to use
-them, and which constraints are durable. For operational setup, code anchors,
-and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md).
-
-## What It Is
-
-Fine-tuning datasets often contain examples with highly variable lengths. When
-those examples are batched conventionally, many tokens in each batch are just
-padding. Packed sequences reduce that waste by building longer packs from
-multiple examples and carrying boundary metadata into the attention path.
-
-In Bridge today, there are two distinct packing paths plus long-context
-enablement through context parallelism:
-
-| Path | Use case | Key config |
-|---|---|---|
-| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` |
-| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` |
-| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` |
-
-These are related but they are not the same knob. Offline packed SFT and VLM
-in-batch packing solve padding waste; long-context training primarily addresses
-activation memory and communication tradeoffs at larger sequence lengths.
-
-## When to Use It
-
-Packed sequences are a good fit when all of the following are true:
-
-- you are doing SFT, PEFT, or VLM finetuning (all three packing paths are
-  supported; see the path table above)
-- your examples have variable lengths and padding waste is significant
-- you can tolerate the micro-batch constraints of packed training
-
-Packed sequences are usually not the right answer when:
-
-- you are doing standard Megatron-style pretraining, which already concatenates
-  documents during sampling
-- you want long-context training in general, where context parallelism is often
-  the main technique
-- your model family or recipe explicitly opts out of packed-sequence support
-
-## Stable Constraints
-
-The durable constraints for packed sequences in Bridge are:
-
-- packed SFT requires `micro_batch_size == 1`
-- when context parallelism is used, sequence length must satisfy the standard
-  CP divisibility constraints
-- for fine-tuning with CP enabled, per-token loss behavior and reduction
-  settings matter
-- CUDA-graph-friendly packed metadata requires additional padding constraints
-
-Model-family support is not universal. Some families and recipe paths explicitly
-opt out of packed sequences or related packing modes.
-
-## Relationship to Long-Sequence Training
-
-Packed sequences and long-sequence training are often mentioned together because
-both affect sequence layout and memory behavior, but they solve different
-problems:
-
-- packed sequences mainly reduce padding waste in fine-tuning datasets
-- long-sequence training mainly addresses activation memory and communication
-  tradeoffs at larger sequence lengths
-
-For long-sequence training guidance, see:
-
-- `docs/performance-guide.md`
-- `docs/training/hybrid-context-parallel.md`
-
-## Practical Caveats
-
-The most stable caveats to remember are:
-
-1. Packed-sequence support is recipe- and model-family-specific.
-2. Fine-tuning sequence packing should not be assumed to work with every other
-   training feature.
-3. Packed sequences improve efficiency primarily by reducing padding waste, not
-   by replacing long-context parallelism or memory-planning techniques.
-
-## Related Docs
-
-- [docs/training/multi-token-prediction.md](multi-token-prediction.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md)
-- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md)
-- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml)
-- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md)
-- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/parallelism-strategies/SKILL.md
-```md
----
-name: parallelism-strategies
-description: Operational guide for choosing and combining parallelism strategies in Megatron Bridge, including sizing rules, hardware topology mapping, and combined parallelism configuration.
----
-
-# Parallelism Strategy Selection Skill
-
-For stable background on each parallelism type, see:
-
-- `docs/parallelisms.md`
-- `card.yaml` (co-located)
-
-## Decision by Model Size
-
-### Dense models
-
-| Model size | GPUs | Recommended starting point |
-|---|---|---|
-| < 1B | 1-8 | DP only |
-| 1-10B | 8-16 | TP=2-4 + DP |
-| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP |
-| 70-175B | 64-256 | TP=8 + PP=4-8 + DP |
-| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP |
-
-### MoE models
-
-MoE parallelism differs from dense models. Because only a fraction of
-parameters are active per token, TP can often stay at 1 or 2 — the active
-parameter shard already fits on a single GPU. EP is the primary scaling
-dimension, with PP handling cross-node layer distribution.
-
-| Model (total / active) | TP | PP | EP | Notes |
-|---|---|---|---|---|
-| OLMoE 7B / 1B | 1 | 1 | 8 | EP only, fits single node |
-| Moonlight 16B / 3B | 2 | 1 | 8 | small TP for shared layers |
-| DeepSeek-V2 236B / 21B | 1 | 4 | 32 | no TP at all |
-| GLM-4.5 Air 106B / 12B | 1 | 4 | 8 | no TP at all |
-| Qwen3 30B-A3B | 4 | 2 | 4 | |
-| GLM-4.5 355B / 32B | 2 | 8 | 16 | |
-| Qwen3 235B-A22B | 4 | 16 | 8 | CP=2 for pretrain |
-| DeepSeek-V3 671B / 37B | 2 | 16 | 64 | TP=2, not 8 |
-| Kimi-K2 1T | 2 | 16 | 32 | |
-
-Key patterns:
-
-- TP is sized by **active** params, not total params. A 671B MoE with
-  37B active needs far less TP than a 70B dense model.
-- EP scales with expert count. Common: EP = num_experts or
-  num_experts / experts_per_gpu.
-- PP handles depth. Large MoE models use PP=8-16 across nodes.
-- ETP (expert tensor parallelism) is rarely used. Llama 4 is an
-  exception (ETP=4).
-
-These are starting points, not hard rules. Always profile the first
-iteration to verify memory and communication.
-
-## Decision by Hardware Topology
-
-Single node with NVLink:
-
-```python
-cfg.model.tensor_model_parallel_size = 8
-```
-
-Multiple nodes with InfiniBand:
-
-```python
-cfg.model.tensor_model_parallel_size = 8
-cfg.model.pipeline_model_parallel_size = N
-```
-
-Limited network (Ethernet):
-
-```python
-cfg.model.tensor_model_parallel_size = 4
-cfg.model.pipeline_model_parallel_size = M
-```
-
-The stable rule is: keep TP within a single NVLink domain. Use PP or DP
-for cross-node scaling. TP across nodes is almost always a performance
-loss.
-
-## Decision by Sequence Length
-
-| Sequence length | Recommendation |
-|---|---|
-| < 2K | standard TP + PP + DP |
-| 2K-8K | add SP (`sequence_parallel=True`) |
-| 8K-32K | add CP=2 |
-| 32K+ | add CP=4-8, consider `a2a+p2p` for large CP |
-
-## Combined Parallelism Enablement
-
-3D parallelism (TP + PP + DP):
-
-```python
-cfg.model.tensor_model_parallel_size = 4
-cfg.model.pipeline_model_parallel_size = 4
-cfg.model.sequence_parallel = True
-```
-
-4D parallelism (TP + PP + CP + DP):
-
-```python
-cfg.model.tensor_model_parallel_size = 8
-cfg.model.pipeline_model_parallel_size = 8
-cfg.model.context_parallel_size = 2
-cfg.model.sequence_parallel = True
-```
-
-MoE with EP + PP (e.g. DeepSeek-V2 236B on 128 GPUs):
-
-```python
-cfg.model.tensor_model_parallel_size = 1
-cfg.model.pipeline_model_parallel_size = 4
-cfg.model.expert_model_parallel_size = 32
-cfg.model.sequence_parallel = False
-```
-
-MoE with small TP + PP + EP (e.g. DeepSeek-V3 671B on 256 GPUs):
-
-```python
-cfg.model.tensor_model_parallel_size = 2
-cfg.model.pipeline_model_parallel_size = 16
-cfg.model.expert_model_parallel_size = 64
-cfg.model.sequence_parallel = True
-```
-
-DP size is always implicit:
-
-```
-data_parallel_size = world_size / (TP * PP * CP)
-```
-
-## Memory Estimation
-
-Without parallelism (70B model, FP16):
-
-```
-parameters:       140 GB
-gradients:        140 GB
-optimizer states: 280 GB (Adam)
-activations:       48 GB (batch=1, seq=4K)
-total:            608 GB
-```
-
-With TP=4, PP=4, DP=4 (64 GPUs):
-
-```
-parameters:        8.75 GB per GPU
-gradients:         8.75 GB per GPU
-optimizer states: 17.50 GB per GPU
-activations:       3.00 GB per GPU
-total:           ~38    GB per GPU
-```
-
-## Code Anchors
-
-Parallelism dimensions set in model provider:
-
-```66:81:docs/parallelisms.md
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,
-    # ... other model parameters
-)
-```
-
-DP size calculation:
-
-```424:436:docs/parallelisms.md
-data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size)
-```
-
-Bridge initialization wires parallelism into process groups:
-
-```618:628:src/megatron/bridge/training/initialize.py
-parallel_state.initialize_model_parallel(
-    tensor_model_parallel_size=model_config.tensor_model_parallel_size,
-    pipeline_model_parallel_size=model_config.pipeline_model_parallel_size,
-    ...
-    context_parallel_size=model_config.context_parallel_size,
-    hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes,
-    expert_model_parallel_size=model_config.expert_model_parallel_size,
-    ...
-)
-```
-
-## Pitfalls
-
-1. TP across nodes destroys throughput. Always keep TP within a single
-   NVLink domain.
-
-2. PP without interleaving has large pipeline bubbles. Use
-   `virtual_pipeline_model_parallel_size` when possible.
-
-3. SP requires `tensor_model_parallel_size > 1`. Enabling SP alone
-   without TP is a config error.
-
-4. CP requires `seq_length % (2 * context_parallel_size) == 0`.
-
-5. EP is only for MoE models. Setting `expert_model_parallel_size` on a
-   dense model is a no-op or error.
-
-6. The model-size-to-parallelism table above is a starting heuristic.
-   Always profile the first iteration to check memory and communication.
-
-7. `CUDA_DEVICE_MAX_CONNECTIONS` and related env vars interact with
-   overlap settings. See `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md`.
-
-## Verification
-
-Quick sanity check that combined parallelism initializes correctly using
-the smallest available recipe with overridden parallelism:
-
-```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 uv run python -m torch.distributed.run --nproc_per_node=4 \
-  scripts/training/run_recipe.py \
-  --recipe llama32_1b_pretrain_config \
-  model.tensor_model_parallel_size=2 \
-  model.pipeline_model_parallel_size=2 \
-  model.sequence_parallel=True \
-  train.train_iters=3 train.global_batch_size=8 train.micro_batch_size=1 \
-  scheduler.lr_warmup_iters=0 \
-  validation.eval_iters=0 validation.eval_interval=0 \
-  checkpoint.save_interval=0 \
-  logger.log_interval=1
-```
-
-Success criteria:
-
-- exit code 0
-- finite loss at iteration 3 (e.g. `lm loss: 1.003808E+01`)
-- log shows TP=2 PP=2 DP=1 layout with 4 ranks
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/sequence-packing/SKILL.md
-```md
----
-name: sequence-packing
-description: Operational guide for enabling packed sequences and long-context config paths in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification.
----
-
-# Sequence Packing Skill
-
-For stable background and recommendation level, see:
-
-- `docs/training/packed-sequences.md`
-- `card.yaml` (co-located)
-
-## Enablement
-
-Offline packed SFT for LLM finetuning:
-
-```python
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-
-cfg.train.micro_batch_size = 1
-cfg.dataset.seq_length = 4096
-cfg.model.seq_length = 4096
-cfg.dataset.dataset_kwargs = {"pad_to_max_length": True}
-cfg.dataset.packed_sequence_specs = PackedSequenceSpecs(
-    packed_sequence_size=4096,
-    pad_seq_to_mult=1,
-)
-```
-
-If CP is enabled:
-
-```python
-cfg.model.context_parallel_size = 2
-cfg.model.calculate_per_token_loss = True
-cfg.ddp.average_in_collective = False
-cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2
-```
-
-If CUDA graphs are enabled for this packed path:
-
-```python
-cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
-cfg.dataset.dataset_kwargs["pad_to_max_length"] = True
-```
-
-**Note:** `pad_cu_seqlens = True` also requires a metadata JSON file alongside
-the packed dataset (asserted in `src/megatron/bridge/data/datasets/sft.py`).
-Custom packed datasets that omit the metadata file will hit an assertion at
-dataset initialization.
-
-In-batch packing for VLM finetuning:
-
-```python
-cfg.dataset.pack_sequences_in_batch = True
-cfg.train.micro_batch_size = 2
-```
-
-Long-context baseline:
-
-```python
-cfg.model.seq_length = 16384
-cfg.dataset.seq_length = 16384
-cfg.model.context_parallel_size = 2
-```
-
-## Code Anchors
-
-LLM packed SFT config surface:
-
-```72:97:src/megatron/bridge/recipes/utils/finetune_utils.py
-if packed_sequence:
-    dataset_kwargs = {"pad_to_max_length": True}
-    packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult)
-else:
-    dataset_kwargs = {}
-    packed_sequence_specs = None
-```
-
-Bridge validation:
-
-```1617:1657:src/megatron/bridge/training/config.py
-if self.model.context_parallel_size > 1:
-    assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, ...
-    if isinstance(self.dataset, FinetuningDatasetConfig):
-        assert self.model.calculate_per_token_loss, ...
-        assert not self.ddp.average_in_collective, ...
-...
-if ... packed_sequence_size > 0 and self.train.micro_batch_size > 1:
-    raise ValueError(...)
-...
-if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1:
-    raise ValueError(...)
-```
-
-VLM in-batch runtime:
-
-```308:327:src/megatron/bridge/training/vlm_step.py
-if enable_packing:
-    ...
-    ) = pack_batch_sequences(
-        ...
-        pad_token_id=0,
-        pad_to_multiple_of=cp_size * 2 if cp_size > 1 else 1,
-    )
-```
-
-Packed THD runtime constraint:
-
-```61:64:src/megatron/bridge/training/gpt_step.py
-if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1:
-    raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)")
-```
-
-## Pitfalls
-
-1. Offline packed SFT and VLM in-batch packing are different features with opposite micro-batch rules.
-2. When CP is enabled, packed sequence lengths must respect `2 * context_parallel_size` divisibility.
-3. For finetuning with CP, `calculate_per_token_loss=True` and `ddp.average_in_collective=False` are required.
-4. `pad_cu_seqlens=True` also requires `pad_to_max_length=True`.
-5. Packing support is model-family-specific. `Qwen3-Next`, `GLM-4.5`, and `Qwen3.5-VL` contain explicit opt-outs in different paths.
-6. MTP finetuning is documented as incompatible with packed sequences.
-
-## Verification
-
-Use the checked-in unit coverage:
-
-```bash
-uv run python -m pytest tests/unit_tests/training/utils/test_packed_seq_utils.py -v && \
-uv run python -m pytest tests/unit_tests/training/test_config.py -k "packed_sequence or pack_sequences_in_batch or context_parallel_seq_length_divisibility or context_parallel_finetuning_validations" -v && \
-uv run python -m pytest tests/unit_tests/training/test_vlm_step.py -k "enable_packing" -v
-```
-
-Success criteria:
-
-- first command reports `8 passed`
-- second command reports `14 passed`
-- third command reports `2 passed`
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/tp-dp-comm-overlap/SKILL.md
-```md
----
-name: tp-dp-comm-overlap
-description: Operational guide for enabling TP, DP, and PP communication overlap in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification.
----
-
-# TP / DP / PP Communication Overlap Skill
-
-For stable background and recommendation level, see:
-
-- `docs/training/communication-overlap.md`
-
-## Enablement
-
-Minimal Bridge override:
-
-```python
-from megatron.bridge.training.comm_overlap import CommOverlapConfig
-
-cfg.model.tensor_model_parallel_size = 4
-cfg.model.sequence_parallel = True
-cfg.model.pipeline_model_parallel_size = 4
-cfg.model.virtual_pipeline_model_parallel_size = 2
-
-cfg.comm_overlap = CommOverlapConfig(
-    tp_comm_overlap=True,
-)
-
-cfg.ddp.use_distributed_optimizer = True
-cfg.ddp.overlap_grad_reduce = True
-cfg.ddp.overlap_param_gather = True
-```
-
-Optional TP preset:
-
-```python
-from megatron.bridge.training.comm_overlap import userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048
-
-cfg.comm_overlap.tp_comm_overlap_cfg = userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048
-```
-
-Precision knobs belong to mixed precision:
-
-```python
-cfg.mixed_precision.grad_reduce_in_fp32 = False
-cfg.mixed_precision.fp8_param_gather = False
-```
-
-## Code Anchors
-
-Bridge overlap gating:
-
-```439:449:src/megatron/bridge/training/comm_overlap.py
-if self.user_comm_overlap_cfg.tp_comm_overlap is True:
-    if model_cfg.tensor_model_parallel_size < 2:
-        ...
-    elif not model_cfg.sequence_parallel:
-        ...
-    elif not HAVE_TE:
-        ...
-```
-
-PP overlap selection:
-
-```451:458:src/megatron/bridge/training/comm_overlap.py
-if model_cfg.pipeline_model_parallel_size > 1:
-    if vp_size > 1:
-        comm_overlap_cfg.overlap_p2p_comm = True
-        comm_overlap_cfg.batch_p2p_comm = False
-    else:
-        comm_overlap_cfg.overlap_p2p_comm = False
-        comm_overlap_cfg.batch_p2p_comm = True
-```
-
-DP overlap defaults:
-
-```572:579:src/megatron/bridge/training/comm_overlap.py
-if self.data_parallel_size > 1:
-    comm_overlap_cfg.bucket_size = 128 * 1024 * 1024
-    comm_overlap_cfg.overlap_grad_reduce = True
-    comm_overlap_cfg.overlap_param_gather = True
-```
-
-Launch-time env tuning:
-
-```570:609:src/megatron/bridge/recipes/run_plugins.py
-executor.env_vars["CUDA_DEVICE_MAX_CONNECTIONS"] = str(cuda_device_max_connections)
-...
-executor.env_vars["NVTE_FWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
-executor.env_vars["NVTE_BWD_LAYERNORM_SM_MARGIN"] = str(self.layernorm_sm_margin)
-```
-
-## Pitfalls
-
-1. TP overlap silently disables itself if `sequence_parallel=False` or Transformer Engine is unavailable.
-2. PP overlap is not enabled for all PP cases. Bridge only auto-selects `overlap_p2p_comm=True` when `PP > 1` and `VPP > 1`.
-3. `bucket_size` is a parameter-count knob, not a byte-size knob.
-4. `grad_reduce_in_fp32` and `fp8_param_gather` should be set through mixed precision, not as standalone DDP tuning first.
-5. `CUDA_DEVICE_MAX_CONNECTIONS` and LayerNorm SM margin are launch-time plugin settings, not `CommOverlapConfig` fields.
-
-## Verification
-
-Use the checked-in overlap unit coverage first:
-
-```bash
-uv run python -m pytest tests/unit_tests/training/test_comm_overlap.py -q
-```
-
-Optional second check if `nemo_run` is available:
-
-```bash
-uv run python -m pytest tests/unit_tests/recipes/test_run_plugins.py -q
-```
-
-Success criteria:
-
-- first command reports `26 passed`
-- second command validates plugin-owned env wiring when not skipped
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/cuda-graphs/SKILL.md
-```md
----
-name: cuda-graphs
-description: Validate and use CUDA graph capture in Megatron Bridge, including local full-iteration graphs and Transformer Engine scoped graphs for attention, MLP, and MoE modules.
----
-
-# CUDA Graphs
-
-Stable docs: `docs/training/cuda-graphs.md`
-Card: `card.yaml` (co-located)
-
-## What It Is
-
-CUDA graphs capture GPU operations once and replay them with minimal
-host-driver overhead. Bridge supports two implementations:
-
-| `cuda_graph_impl` | Mechanism | Scope support |
-|---|---|---|
-| `"local"` | MCore `FullCudaGraphWrapper` wrapping entire fwd+bwd | `full_iteration` |
-| `"transformer_engine"` | TE `make_graphed_callables()` per layer | `attn`, `mlp`, `moe`, `moe_router`, `moe_preprocess`, `mamba` |
-
-## Enablement
-
-### Local full-iteration graph
-
-```python
-cfg.model.cuda_graph_impl = "local"
-cfg.model.cuda_graph_scope = ["full_iteration"]
-cfg.model.cuda_graph_warmup_steps = 3
-cfg.model.use_te_rng_tracker = True
-cfg.rng.te_rng_tracker = True
-cfg.rerun_state_machine.check_for_nan_in_loss = False
-cfg.ddp.check_for_nan_in_grad = False
-```
-
-### TE scoped graph (dense model)
-
-```python
-cfg.model.cuda_graph_impl = "transformer_engine"
-cfg.model.cuda_graph_scope = ["attn"]           # or ["attn", "mlp"]
-cfg.model.cuda_graph_warmup_steps = 3
-cfg.model.use_te_rng_tracker = True
-cfg.rng.te_rng_tracker = True
-```
-
-### TE scoped graph (MoE model)
-
-```python
-cfg.model.cuda_graph_impl = "transformer_engine"
-cfg.model.cuda_graph_scope = ["attn", "moe_router", "moe_preprocess"]
-cfg.model.cuda_graph_warmup_steps = 3
-cfg.model.use_te_rng_tracker = True
-cfg.rng.te_rng_tracker = True
-```
-
-### Performance harness CLI
-
-```bash
-python scripts/performance/run_performance_workload.py \
-  --cuda_graph_impl transformer_engine \
-  --cuda_graph_scope attn moe_router moe_preprocess \
-  ...
-```
-
-Valid CLI values live in `scripts/performance/argument_parser.py`:
-- `VALID_CUDA_GRAPH_IMPLS`: `["none", "local", "transformer_engine"]`
-- `VALID_CUDA_GRAPH_SCOPES`: `["full_iteration", "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba"]`
-
-### Required constraints
-
-- `use_te_rng_tracker = True` (enforced in `gpt_provider.py`)
-- `full_iteration` scope only with `cuda_graph_impl = "local"`
-- `full_iteration` scope requires `check_for_nan_in_loss = False`
-- Do not combine `moe` scope and `moe_router` scope
-- Tensor shapes must be static (fixed seq_length, fixed micro_batch_size)
-- MoE token-dropless routing limits graphable scope to dense modules
-- With `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`, set
-  `NCCL_GRAPH_REGISTER=0` (MCore enforces for local impl on arch < sm_100;
-  TE impl asserts unconditionally)
-- CPU offloading is incompatible with CUDA graphs
-- `moe_preprocess` scope requires `moe_router` scope to also be set
-
-## Code Anchors
-
-### Bridge config and validation
-
-```1524:1531:src/megatron/bridge/training/config.py
-        # CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph
-        if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope:
-            assert not self.rerun_state_machine.check_for_nan_in_loss, (
-                "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. "
-                "Set rerun_state_machine.check_for_nan_in_loss=False."
-            )
-        if self.model.cuda_graph_impl == "none":
-            self.model.cuda_graph_scope = []
-```
-
-### TE RNG tracker requirement
-
-```213:216:src/megatron/bridge/models/gpt_provider.py
-        if self.cuda_graph_impl != "none":
-            assert getattr(self, "use_te_rng_tracker", False), (
-                "Transformer engine's RNG tracker is required for cudagraphs, it can be "
-                "enabled with use_te_rng_tracker=True'."
-```
-
-### Graph creation and capture in training loop
-
-```231:255:src/megatron/bridge/training/train.py
-    # Capture CUDA Graphs.
-    cuda_graph_helper = None
-    if model_config.cuda_graph_impl == "transformer_engine":
-        cuda_graph_helper = TECudaGraphHelper(...)
-    # ...
-    if config.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in config.model.cuda_graph_scope:
-        forward_backward_func = FullCudaGraphWrapper(
-            forward_backward_func, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
-        )
-```
-
-### TE graph capture after warmup
-
-```338:350:src/megatron/bridge/training/train.py
-        # Capture CUDA Graphs after warmup.
-        if (
-            model_config.cuda_graph_impl == "transformer_engine"
-            and cuda_graph_helper is not None
-            and not cuda_graph_helper.graphs_created()
-            and global_state.train_state.step - start_iteration == model_config.cuda_graph_warmup_steps
-        ):
-            if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook:
-                disable_forward_pre_hook(model, param_sync=False)
-            cuda_graph_helper.create_cudagraphs()
-            if model_config.cuda_graph_warmup_steps > 0 and should_toggle_forward_pre_hook:
-                enable_forward_pre_hook(model)
-                cuda_graph_helper.cuda_graph_set_manual_hooks()
-```
-
-### RNG initialization
-
-```199:206:src/megatron/bridge/training/initialize.py
-        _set_random_seed(
-            rng_config.seed,
-            rng_config.data_parallel_random_init,
-            rng_config.te_rng_tracker,
-            rng_config.inference_rng_tracker,
-            use_cudagraphable_rng=(model_config.cuda_graph_impl != "none"),
-            pg_collection=pg_collection,
-        )
-```
-
-### Delayed wgrad + CUDA graph interaction
-
-```522:555:src/megatron/bridge/training/comm_overlap.py
-            cuda_graph_scope = getattr(model_cfg, "cuda_graph_scope", []) or []
-            # ... scope parsing ...
-            if wgrad_in_graph_scope:
-                assert is_te_min_version("2.12.0"), ...
-                assert model_cfg.gradient_accumulation_fusion, ...
-                if attn_scope_enabled:
-                    assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, ...
-```
-
-### Perf harness override helper
-
-```102:124:scripts/performance/utils/overrides.py
-def _set_cuda_graph_overrides(
-    recipe, cuda_graph_impl=None, cuda_graph_scope=None
-):
-    # Sets impl, scope, and auto-enables te_rng_tracker
-```
-
-### Graph cleanup
-
-```1414:1441:src/megatron/bridge/training/train.py
-def _delete_cuda_graphs(cuda_graph_helper):
-    # Deletes FullCudaGraphWrapper and TE graph objects to free NCCL buffers
-```
-
-### MCore classes (in 3rdparty/Megatron-LM)
-
-- `CudaGraphManager`: `megatron/core/transformer/cuda_graphs.py`
-- `TECudaGraphHelper`: `megatron/core/transformer/cuda_graphs.py`
-- `FullCudaGraphWrapper`: `megatron/core/full_cuda_graph.py`
-- `CudaGraphScope` enum: `megatron/core/transformer/enums.py`
-
-### Positive recipe anchors
-
-- `scripts/performance/configs/deepseek/deepseek_workload_base_configs.py`
-- `scripts/performance/configs/qwen/qwen3_workload_base_configs.py`
-- `scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py`
-
-### Tests
-
-| File | Coverage |
-|---|---|
-| `tests/unit_tests/training/test_config.py` | `full_iteration` NaN-check constraint |
-| `tests/unit_tests/training/test_comm_overlap.py` | `delay_wgrad` + CUDA graph interaction |
-| `tests/unit_tests/models/test_gpt_full_te_layer_autocast_spec.py` | TE autocast with CUDA graphs |
-| `tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py` | End-to-end local and TE graph smoke tests |
-| `tests/unit_tests/recipes/kimi/test_kimi_k2.py` | TE + CUDA graph recipe config |
-| `tests/unit_tests/recipes/gpt/test_gpt3_175b.py` | TE + CUDA graph recipe config |
-| `tests/unit_tests/recipes/qwen_vl/test_qwen25_vl_recipes.py` | VLM CUDA graph settings |
-
-## Pitfalls
-
-1. **TE RNG tracker is mandatory**: Setting `cuda_graph_impl` without
-   `use_te_rng_tracker=True` and `rng.te_rng_tracker=True` will assert
-   in the provider.
-
-2. **`full_iteration` requires NaN checks disabled**: The entire fwd+bwd is
-   captured, so loss-NaN checking cannot inspect intermediate values.
-
-3. **MoE scope restrictions**: `moe` scope and `moe_router` scope are
-   mutually exclusive. Token-dropless MoE can only graph `moe_router` and
-   `moe_preprocess`, not the full expert dispatch.
-
-4. **Memory overhead**: CUDA graphs pin all intermediate buffers for the
-   graph's lifetime (no memory reuse). TE scoped graphs add a few GB;
-   full-iteration graphs can increase peak memory by 1.5–2×. `PP > 1`
-   compounds overhead since each stage holds its own graph.
-
-5. **Delayed wgrad interaction**: When `delay_wgrad_compute=True` and
-   attention or MoE router is in `cuda_graph_scope`, additional constraints
-   apply: TE >= 2.12.0, `gradient_accumulation_fusion=True`, and no
-   attention bias.
-
-6. **Variable-length sequences break graphs**: Sequence lengths must be
-   constant across steps. Use padded packed sequences if packing is needed.
-
-7. **Graph cleanup is required**: CUDA graph objects hold NCCL buffer
-   references. Bridge handles this in `_delete_cuda_graphs()` at the end
-   of training, but early exits must call it explicitly.
-
-8. **Older GPU architectures**: On GPUs with compute capability < 10.0
-   (pre-Blackwell), set `NCCL_GRAPH_REGISTER=0` when using
-   `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. Enforced in MCore
-   `CudaGraphManager` (cuda_graphs.py:1428) and `TECudaGraphHelper`
-   (cuda_graphs.py:1697). The TE impl asserts unconditionally regardless
-   of arch.
-
-9. **CPU offloading incompatible**: CUDA graphs cannot be used with CPU
-   offloading. Enforced in MCore `transformer_config.py:1907`.
-
-10. **MoE recompute + moe_router scope**: MoE recompute is not supported
-    with `moe_router` CUDA graph scope when using `cuda_graph_impl =
-    "transformer_engine"`. Enforced in MCore `transformer_config.py:1977`.
-
-## Verification
-
-### Unit tests
-
-```bash
-uv run python -m pytest \
-  tests/unit_tests/training/test_config.py -k "cuda_graph" \
-  tests/unit_tests/training/test_comm_overlap.py -k "cuda_graph" \
-  tests/unit_tests/models/test_gpt_full_te_layer_autocast_spec.py -k "cuda_graph" -q
-```
-
-### Functional smoke test (requires GPU)
-
-```bash
-uv run python -m pytest \
-  tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py -q
-```
-
-### Success criteria
-
-- Unit tests pass, covering config validation for both `local` and
-  `transformer_engine` implementations.
-- Functional test completes training steps with both CUDA graph
-  implementations.
-- No NCCL errors or illegal memory access in logs.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/config.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import warnings
-from abc import ABC, abstractmethod
-from dataclasses import MISSING, dataclass, field, fields
-from pathlib import Path
-from typing import Any, Dict, Literal, Optional, Tuple, Union
-
-import torch
-from megatron.core.datasets.gpt_dataset import GPTDatasetConfig as MCoreGPTDatasetConfig
-from megatron.core.distributed import DistributedDataParallelConfig as MCoreDistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig as MCoreOptimizerConfig
-from megatron.core.optimizer import (
-    ParamGroupOverride,
-    ParamKey,
-)
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
-from megatron.core.transformer.module import MegatronModule
-from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig
-from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig
-from megatron.training.config import CheckpointConfig as MTrainCheckpointConfig
-from megatron.training.config import DistributedInitConfig as MTrainDistributedInitConfig
-from megatron.training.config import LoggerConfig as MTrainLoggerConfig
-from megatron.training.config import ProfilingConfig as MTrainProfilingConfig
-from megatron.training.config import RerunStateMachineConfig as MTrainRerunStateMachineConfig
-from megatron.training.config import RNGConfig, ValidationConfig
-from megatron.training.config import SchedulerConfig as MTrainSchedulerConfig
-from megatron.training.config import StragglerDetectionConfig as MTrainStragglerDetectionConfig
-from megatron.training.config import TrainingConfig as MTrainTrainingConfig
-
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-from megatron.bridge.models import GPTModelProvider, T5ModelProvider
-from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig
-from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig
-from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider
-from megatron.bridge.models.mimo.mimo_provider import MimoModelProvider
-from megatron.bridge.peft.base import PEFT
-from megatron.bridge.training.comm_overlap import CommOverlapConfig
-from megatron.bridge.training.flex_dispatcher_backend import validate_flex_dispatcher_backend
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config
-from megatron.bridge.training.tokenizers.config import TokenizerConfig
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-from megatron.bridge.training.utils.config_utils import _ConfigContainerBase as Container
-from megatron.bridge.utils.common_utils import (
-    get_world_size_safe,
-    print_rank_0,
-    warn_rank_0,
-)
-
-
-@dataclass
-class DistributedDataParallelConfig(MCoreDistributedDataParallelConfig):
-    """Megatron Core DistributedDataParallelConfig with deferred post-init.
-
-    This class inherits from Megatron Core's DistributedDataParallelConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    param_name_patterns_for_fp32_local_accumulation: Tuple[str, ...] = ()
-    """fnmatch patterns selecting parameters whose gradients should be locally
-    accumulated in FP32. The special pattern ``'all'`` matches every parameter.
-    Synced from MCore c586f6d56 (#4028); field will be inherited from the base
-    class after the next mcore bump."""
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic.
-
-        This method calls the original Megatron Core DistributedDataParallelConfig.__post_init__()
-        to compute derived fields based on the current field values.
-        """
-        super().__post_init__()
-
-
-@dataclass
-class OptimizerConfig(MCoreOptimizerConfig):
-    """Megatron Core OptimizerConfig with deferred post-init.
-
-    This class inherits from Megatron Core's OptimizerConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic.
-
-        This method calls the original Megatron Core OptimizerConfig.__post_init__()
-        to compute derived fields based on the current field values.
-        """
-        super().__post_init__()
-
-
-@dataclass(kw_only=True)
-class DistributedInitConfig(MTrainDistributedInitConfig):
-    """Configuration settings for distributed training initialization."""
-
-    external_gpu_device_mapping: bool = False
-    """If True, indicates that GPU device mapping has been externally managed
-    (e.g., via CUDA_VISIBLE_DEVICES environment variable). When True, uses device 0
-    instead of local rank for CUDA device selection. This is useful when launching
-    with external process managers that handle GPU visibility.
-    """
-
-    enable_megatron_core_experimental: bool = False
-    """Enable experimental features for Megatron Core."""
-
-    use_decentralized_pg: bool = False
-    """Use ProcessGroupCollection passed through functions instead of relying on mcore's
-    global parallel state (mpu) variables. When True, parallel groups are obtained from
-    the pg_collection object rather than the global megatron.core.parallel_state module."""
-
-    @property
-    def lazy_init(self) -> bool:
-        return self.lazy_mpu_init
-
-    @lazy_init.setter
-    def lazy_init(self, value: bool) -> None:
-        self.lazy_mpu_init = value
-
-
-@dataclass(kw_only=True)
-class RerunStateMachineConfig(MTrainRerunStateMachineConfig):
-    """Configuration for the rerun state machine used for result validation or stats."""
-
-    rerun_mode: Literal["disabled", "validate_results", "report_determinism_stats"] = "disabled"
-    """Use re-run engine to validate results (default) or to emit stats
-    on variability of computations due to non-deterministic algorithms."""
-
-    spiky_loss_factor: float = 10.0
-    """Factor for detecting spiky loss. A loss is considered spiky if it exceeds
-    this multiple of the max observed loss over the sample window."""
-
-
-@dataclass(kw_only=True)
-class DataloaderConfig:
-    """Base configuration for data loading."""
-
-    dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = None
-    """Dataloader type: 'single' for single pass, 'cyclic' for multiple passes with shuffling,
-    'batch' for global batch sampling (used in fine-tuning), or 'external' for custom dataloaders."""
-
-    num_workers: int = 2
-    """Dataloader number of workers."""
-
-    data_sharding: bool = True
-    """Disable data sharding."""
-
-    pin_memory: bool = True
-    """Whether to pin memory during data loading for faster GPU training."""
-
-    drop_last: bool = True
-    """Whether to drop the last incomplete batch."""
-
-    persistent_workers: bool = True
-    """Whether to keep data loading workers persistent across epochs.
-    Automatically set to False when num_workers is 0."""
-
-    trust_remote_code: Optional[bool] = None
-    """Whether remote code execution should be trusted for a given HF path."""
-
-    def finalize(self):
-        """Finalize dataloader config field constraints."""
-        if self.num_workers == 0 and self.persistent_workers:
-            self.persistent_workers = False
-
-
-@dataclass(frozen=True)
-class DatasetBuildContext:
-    """Interface that encapsulates framework internals.
-
-    This context provides metadata needed to build datasets
-    while hiding implementation details of the framework.
-
-    Attributes:
-        train_samples: Number of samples for training dataset
-        valid_samples: Number of samples for validation dataset
-        test_samples: Number of samples for test dataset
-        tokenizer: Optional tokenizer instance for text processing
-        pg_collection: Optional process group collection for distributed training
-    """
-
-    train_samples: int
-    valid_samples: int
-    test_samples: int
-    tokenizer: Optional[MegatronTokenizer] = None
-    pg_collection: Optional[ProcessGroupCollection] = None
-
-
-@dataclass(frozen=True)
-class OptimizerConfigOverrideProviderContext:
-    """Context for providing config overrides."""
-
-    scheduler_config: "SchedulerConfig"
-    optimizer_config: OptimizerConfig
-    model: Union[MegatronModule, list[MegatronModule]]
-
-
-@dataclass
-class OptimizerConfigOverrideProvider:
-    """Abstract base class for providing config overrides."""
-
-    def build_config_overrides(
-        self, context: OptimizerConfigOverrideProviderContext
-    ) -> dict[ParamKey, ParamGroupOverride] | None:
-        """Build config overrides for weight decay based on scheduler configuration.
-
-        This function creates parameter-specific overrides for weight decay behavior.
-        By default, weight decay is skipped for bias parameters and 1D parameters.
-        For Qwen3-Next models, weight decay is applied to q_layernorm and k_layernorm.
-
-        Args:
-            context: OptimizerConfigOverrideProviderContext which packages the scheduler
-                configuration, optimizer configuration, and model.
-
-        Returns:
-            Dictionary of ParamKey to ParamGroupOverride for the optimizer
-        """
-        model = context.model
-        scheduler_config = context.scheduler_config
-        optimizer_config = context.optimizer_config
-
-        config_overrides: dict[ParamKey, ParamGroupOverride] = {}
-
-        # Collect param names that should skip weight decay
-        # NOTE: this can be simplified once https://github.com/NVIDIA/Megatron-LM/pull/2753
-        #  is merged into dev. Then we can re-use megatron's apply_wd_to_qk_layernorm option
-        #  and call megatron.core.optimizer.get_standard_config_overrides(optimizer_config)
-        #  directly for standard settings, replacing the custom logic below for qwen3-next.
-        no_wd_names: list[str] = []
-        is_qwen3_next = scheduler_config.no_weight_decay_cond_type == "qwen3_next"
-
-        model_list = model if isinstance(model, list) else [model]
-        for model_chunk in model_list:
-            for name, param in model_chunk.named_parameters():
-                # Skip weight decay for bias parameters
-                if name.endswith(".bias"):
-                    no_wd_names.append(name)
-                    continue
-
-                # Skip weight decay for 1D parameters
-                if len(param.shape) == 1:
-                    if is_qwen3_next:
-                        # Qwen3-Next: apply weight decay to qk layernorm (don't add to skip list)
-                        if "q_layernorm" in name or "k_layernorm" in name:
-                            continue
-                    no_wd_names.append(name)
-
-        # Create a single ParamKey with all names that should skip weight decay
-        if no_wd_names:
-            no_wd_key = ParamKey(name=tuple(no_wd_names))
-            config_overrides[no_wd_key] = ParamGroupOverride(wd_mult=0.0)
-
-        # Now handle decoupled LR:
-        if optimizer_config.decoupled_lr is not None:
-            decoupled_lr_config: ParamGroupOverride = {"max_lr": optimizer_config.decoupled_lr}
-            decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter")
-            if optimizer_config.decoupled_min_lr is not None:
-                decoupled_lr_config["min_lr"] = optimizer_config.decoupled_min_lr
-            config_overrides[decoupled_param_key] = decoupled_lr_config
-
-        return config_overrides if config_overrides else None
-
-
-@dataclass
-class DatasetProvider(DataloaderConfig, ABC):
-    """Abstract base class for custom dataset configurations.
-
-    Provides an interface for users to implement their own dataset builders
-    while automatically inheriting all DataloaderConfig functionality.
-
-    Users must:
-    1. Inherit from this class
-    2. Implement the build_datasets() method
-
-    Example:
-        @dataclass
-        class S3DatasetConfig(DatasetProvider):
-            bucket_name: str
-            data_prefix: str
-            seq_length: int
-
-            def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]:
-                # Custom implementation to load data from S3
-                train_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/train", context.tokenizer)
-                valid_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/valid", context.tokenizer)
-                test_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/test", context.tokenizer)
-                return train_ds, valid_ds, test_ds
-    """
-
-    @abstractmethod
-    def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]:
-        """Build train, validation, and test datasets.
-
-        This method is called by the framework during dataset initialization.
-        Implementations should use the provided context to create appropriate
-        datasets for each split.
-
-        Args:
-            context: Build context with sample counts and tokenizer
-
-        Returns:
-            Tuple of (train_dataset, valid_dataset, test_dataset)
-            Any element can be None if that split shouldn't be created.
-
-        Raises:
-            NotImplementedError: Must be implemented by subclasses
-        """
-        pass
-
-
-@dataclass
-class GPTDatasetConfig(MCoreGPTDatasetConfig, DataloaderConfig):
-    """Megatron Core GPTDatasetConfig with deferred post-init.
-
-    This class inherits from MCore's GPTDatasetConfig and DataloaderConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    data_path: str | list[str] | None = None
-    """CLI-friendly alternative to ``blend``.  Accepts a single path string,
-    a space-separated multi-path string, or a list of paths (with optional
-    interleaved weights, matching Megatron-LM ``--data-path`` semantics).
-    Converted to ``blend`` automatically during ``finalize()``."""
-
-    def __init__(
-        self,
-        seq_length: int | None = None,
-        skip_getting_attention_mask_from_dataset: bool = True,
-        data_path: str | list[str] | None = None,
-        *args,
-        **kwargs,
-    ):
-        """
-        Args:
-            seq_length (int | None): the sequence length. If not provided, `sequence_length` must be in kwargs.
-            skip_getting_attention_mask_from_dataset (bool): if set, the dataset will pass a None attention mask
-                and the attention mask is autogenerated from the attn backend.
-            data_path: CLI-friendly data path(s). Converted to ``blend`` in ``finalize()``.
-        """
-        self.skip_getting_attention_mask_from_dataset = skip_getting_attention_mask_from_dataset
-        self.data_path = data_path
-
-        if seq_length is not None:
-            kwargs["sequence_length"] = seq_length
-        elif "sequence_length" not in kwargs:
-            raise ValueError("Either `seq_length` or `sequence_length` must be provided.")
-
-        dataloader_kwargs = {k: kwargs.pop(k) for k in list(kwargs) if k in DataloaderConfig.__dataclass_fields__}
-        MCoreGPTDatasetConfig.__init__(self, *args, **kwargs)
-        DataloaderConfig.__init__(self, **dataloader_kwargs)
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    @property
-    def seq_length(self):
-        """Alias for MCore's `sequence_length` field."""
-        return getattr(self, "sequence_length", None)
-
-    @seq_length.setter
-    def seq_length(self, value):
-        setattr(self, "sequence_length", value)
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic and Bridge-specific checks.
-
-        This method calls the original Megatron Core GPTDatasetConfig.__post_init__()
-        and then performs Bridge-specific validation.
-        """
-        if self.blend is None and self.data_path is not None:
-            from megatron.core.datasets.utils import get_blend_from_list
-
-            if isinstance(self.data_path, str):
-                paths = self.data_path.split()
-            else:
-                paths = list(self.data_path)
-            self.blend = get_blend_from_list(paths)
-
-        # Call MCore's post_init
-        super(MCoreGPTDatasetConfig, self).__post_init__()
-
-        assert self.reset_position_ids is not None, "reset_position_ids must be defined."
-        assert self.reset_attention_mask is not None, "reset_attention_mask must be defined."
-        assert self.eod_mask_loss is not None, "eod_mask_loss must be defined."
-
-        DataloaderConfig.finalize(self)
-
-
-@dataclass
-class GPTFIMDatasetConfig(GPTDatasetConfig):
-    """Configuration object forGPT FIM datasets"""
-
-    def __init__(
-        self,
-        fim_rate: float = None,
-        fim_spm_rate: float = None,
-        fim_extra_tokens: Dict = None,
-        fim_split_sample: Optional[str] = None,
-        fim_fragment_rate: Optional[float] = None,
-        fim_no_prefix: Optional[str] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            fim_rate: float: probability to convert a training sample into a FIM format.
-            fim_spm_rate (float): probability that the a FIM sample uses the SPM format over the PSM format.
-            fim_extra_tokens (Dict): should consist of prefix, middle, suffix, PAD, and EOD tokens.
-            fim_split_sample (str): string around which to split the sample for FIM.
-            fim_fragment_rate (float): rate of FIM on each fragment when split_sample is not None.
-            fim_no_prefix (str): do not apply FIM to fragments that start with this prefix.
-        """
-        self.fim_data = True
-        self.fim_rate = fim_rate
-        self.fim_spm_rate = fim_spm_rate
-        self.fim_extra_tokens = fim_extra_tokens
-        self.fim_split_sample = fim_split_sample
-        self.fim_fragment_rate = fim_fragment_rate
-        self.fim_no_prefix = fim_no_prefix
-
-        super().__init__(**kwargs)
-
-
-@dataclass
-class MockGPTDatasetConfig(GPTDatasetConfig):
-    """Modifies GPTDatasetConfig to enforce necessary options for creating a mock dataset."""
-
-    def __init__(
-        self,
-        seq_length: int,
-        **kwargs,
-    ):
-        super().__init__(seq_length=seq_length, **kwargs)
-
-    def finalize(self):
-        """ """
-        # Raise TypeError if `blend` or `blend_per_split` is not None
-        if self.__dict__.get("blend", None):
-            raise TypeError("got an unexpected keyword argument 'blend'")
-        if self.__dict__.get("blend_per_split", None):
-            raise TypeError("got an unexpected keyword argument 'blend_per_split'")
-        if self.__dict__.get("blend", None) and self.__dict__.get("blend_per_split", None):
-            raise TypeError("got an unexpected keyword argument")
-
-        # Drop `blend` and `blend_per_split` from __dict__
-        self.__dict__.pop("blend", None)
-        self.__dict__.pop("blend_per_split", None)
-
-        return super().finalize()
-
-
-@dataclass(kw_only=True)
-class FinetuningDatasetConfig(DataloaderConfig):
-    """Configuration specific to finetuning datasets, inheriting from DataloaderConfig.
-
-    Note: For fine-tuning, dataloader_type defaults to 'batch' which ensures sequences
-    within each global batch are padded to the same length.
-    """
-
-    dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = "batch"
-    """Dataloader type for fine-tuning. Defaults to 'batch' for optimal padding behavior."""
-
-    dataset_root: Optional[Union[str, Path]] = None
-    seq_length: int
-    seed: int = 1234
-    memmap_workers: int = 1
-    max_train_samples: Optional[int] = None
-    packed_sequence_specs: Optional[PackedSequenceSpecs] = None
-    dataset_kwargs: Optional[dict[str, Any]] = None
-    do_validation: bool = True
-    do_test: bool = True
-
-
-@dataclass(kw_only=True)
-class SchedulerConfig(MTrainSchedulerConfig):
-    """Configuration settings for the learning rate scheduler and weight decay."""
-
-    def finalize(self) -> None:
-        """Post-initialization checks for scheduler config."""
-        if self.start_weight_decay is not None:
-            assert self.start_weight_decay >= 0.0, "start_weight_decay should be positive."
-            assert self.end_weight_decay >= self.start_weight_decay
-
-        if self.override_opt_param_scheduler:
-            assert not self.use_checkpoint_opt_param_scheduler, "both override and use-checkpoint are set."
-
-        # Validate mutual exclusivity between iteration-based and sample-based scheduler fields
-        has_iter_fields = (
-            self.lr_decay_iters is not None or self.lr_warmup_iters != 0 or self.lr_wsd_decay_iters is not None
-        )
-        has_sample_fields = (
-            self.lr_decay_samples is not None or self.lr_warmup_samples != 0 or self.lr_wsd_decay_samples is not None
-        )
-
-        assert not (has_iter_fields and has_sample_fields), (
-            f"Cannot mix iteration-based and sample-based scheduler fields. "
-            f"Found iteration fields: lr_decay_iters={self.lr_decay_iters}, lr_warmup_iters={self.lr_warmup_iters}, lr_wsd_decay_iters={self.lr_wsd_decay_iters}. "
-            f"Found sample fields: lr_decay_samples={self.lr_decay_samples}, lr_warmup_samples={self.lr_warmup_samples}, lr_wsd_decay_samples={self.lr_wsd_decay_samples}. "
-            f"Use either iteration fields OR sample fields, not both."
-        )
-
-        # Validate mutual exclusivity between lr_warmup_fraction and specific warmup fields
-        if self.lr_warmup_fraction is not None:
-            assert self.lr_warmup_iters == 0 and self.lr_warmup_samples == 0, (
-                f"Cannot specify lr_warmup_fraction={self.lr_warmup_fraction} with lr_warmup_iters={self.lr_warmup_iters} or lr_warmup_samples={self.lr_warmup_samples}. "
-                f"Use either lr_warmup_fraction OR lr_warmup_iters OR lr_warmup_samples."
-            )
-
-
-@dataclass(kw_only=True)
-class TrainingConfig(MTrainTrainingConfig):
-    """Configuration settings related to the training loop and validation."""
-
-    check_optimizer_step_success: bool = True
-    """Checks optimizer.step() succeeded at each training step ."""
-
-    skip_sync_grad_norm_across_mp: bool = False
-    """Skips syncing the grad norm across the model parallel group."""
-
-    # ---------------- Validation config. ----------------
-
-    eval_iters: int | None = None
-    """Number of iterations to run for evaluation validation/test for. Deprecated in favor of ValidationConfig."""
-
-    eval_interval: int | None = None
-    """Interval between running evaluation on validation set. Deprecated in favor of ValidationConfig."""
-
-    skip_train: bool | None = None
-    """If set, bypass the training loop, optionally do evaluation for validation/test, and exit. Deprecated in favor of ValidationConfig."""
-
-    def finalize(self) -> None:
-        """Validate training mode specification and calculate train_iters from train_samples if needed."""
-        has_train_iters = self.train_iters is not None
-        has_train_samples = self.train_samples is not None
-
-        assert has_train_iters or has_train_samples, "Either train_iters or train_samples must be provided"
-        assert not (has_train_iters and has_train_samples), "Cannot specify both train_iters and train_samples"
-        if has_train_samples:
-            assert self.train_samples > 0, "train_samples must be positive"
-            assert self.rampup_batch_size is None, "Batch size rampup not supported with sample-based training yet"
-
-            # Calculate train_iters from train_samples (rampup_batch_size already validated as None)
-            self.train_iters = self.train_samples // self.global_batch_size
-            print_rank_0(f"Setting training iterations to {self.train_iters} based on {self.train_samples} samples")
-
-
-@dataclass(kw_only=True)
-class CheckpointConfig(MTrainCheckpointConfig):
-    """Configuration settings for model checkpointing (saving and loading)."""
-
-    pretrained_checkpoint: Optional[str] = None
-    """Directory containing a pretrained model checkpoint for finetuning.
-
-    This can be either:
-      - A parent checkpoint directory (e.g. ``/checkpoints/my_model/``) that
-        contains tracker files (``latest_train_state.pt``) and ``iter_*``
-        subdirectories.
-      - A specific iteration directory (e.g.
-        ``/checkpoints/my_model/iter_0001000/``) that directly contains the
-        checkpoint payload (``run_config.yaml``, weight shards, etc.).
-    """
-
-    storage_writers_per_rank: int = 1
-    """Number of storage writers per rank for torch_dist checkpoint format.
-    Affects the number of checkpoint files: saving_ranks * storage_writers_per_rank."""
-
-    use_persistent_ckpt_worker: bool = True
-    """Use a persistent background worker for async checkpoint saves. When enabled, creates a dedicated
-    worker thread/process for handling async saves. When disabled, uses temporal workers that are
-    created and destroyed for each save operation."""
-
-    async_strategy: str = "nvrx"
-    """Async checkpoint strategy to use. Options: ``"nvrx"`` (default) or ``"mcore"``.
-    The ``"nvrx"`` strategy uses nvidia_resiliency_ext for async checkpointing and falls back
-    to ``"mcore"`` if the package is not installed."""
-
-    async_write_results_mp_mode: str = "fork"
-    """Multiprocessing start method for the async write results queue.
-    Options: ``"fork"`` (default), ``"spawn"``, ``"forkserver"``."""
-
-    strict_fsdp_dtensor_load: bool = False
-    """Whether to enforce strict loading for FSDP DTensor checkpoints. When False, allows partial loading."""
-
-    custom_manager_class: str | None = None
-    """Fully qualified class name for a custom CheckpointManager implementation.
-
-    When set, checkpoint operations will instantiate and delegate to this class instead of the default
-    checkpoint manager. The custom class must implement the `CheckpointManager` protocol
-    defined in `megatron.bridge.training.checkpointing`.
-
-    Example: ``'mypackage.checkpoint.MyCheckpointManager'``
-    """
-
-    def finalize(self) -> None:
-        """Post-initialization checks for checkpoint config."""
-        if self.pretrained_checkpoint is not None:
-            from megatron.bridge.training.utils.checkpoint_utils import file_exists
-
-            assert file_exists(self.pretrained_checkpoint), (
-                f"Pretrained checkpoint {self.pretrained_checkpoint} does not exist"
-            )
-
-        if self.load_main_params_from_ckpt:
-            assert not self.load_optim, "load_main_params_from_ckpt must be used with load_optim=False"
-
-        if self.async_save:
-            assert self.save is not None, "async_save is enabled, but save is not set. Set save to a valid path."
-            assert self.use_persistent_ckpt_worker, "async_save requires use_persistent_ckpt_worker=True."
-
-        # Validate ckpt_step if specified
-        if self.ckpt_step is not None:
-            if self.load is None:
-                raise ValueError(
-                    f"ckpt_step={self.ckpt_step} specified but checkpoint.load is None. "
-                    f"Please set checkpoint.load to the base checkpoint directory."
-                )
-
-        if self.dist_ckpt_optim_fully_reshardable:
-            assert not self.distrib_optim_fully_reshardable_mem_efficient, (
-                "distrib_optim_fully_reshardable_mem_efficient requires use_gloo_process_groups"
-            )
-
-
-@dataclass(kw_only=True)
-class LoggerConfig(MTrainLoggerConfig):
-    """Configuration settings for logging, including TensorBoard and WandB."""
-
-    skip_train_metrics_log: bool = False
-    """Skips logging of training metrics to all logging backends and to the console as well."""
-
-    timing_log_level: Literal[-1, 0, 1, 2] = 0
-    """Granularity level to measure and report timing.
-    -1: To disable timing logging as the timer start from 0 and above.
-    0: report only iteration time and make sure timing does not introduce extra overhead.
-    1: report timing for operations that are executed very limited times (basically once) during each iteration
-        (such as gradient all-reduce)
-    2: report timing for operations that migh be executed numerous times during each iteration.
-    Note that setting the level to 1 or 2 might cause increase in iteration time.
-    """
-
-    mlflow_experiment: Optional[str] = None
-    """The MLFlow experiment name."""
-
-    mlflow_run_name: Optional[str] = None
-    """The MLFlow run name."""
-
-    mlflow_tracking_uri: Optional[str] = None
-    """Optional MLFlow tracking URI."""
-
-    mlflow_tags: Optional[dict[str, str]] = None
-    """Optional tags to apply to the MLFlow run."""
-
-    comet_project: Optional[str] = None
-    """The Comet ML project name. Comet logging is disabled when this is None."""
-
-    comet_experiment_name: Optional[str] = None
-    """The Comet ML experiment name."""
-
-    comet_workspace: Optional[str] = None
-    """The Comet ML workspace. If not set, uses the default workspace for the API key."""
-
-    comet_api_key: Optional[str] = None
-    """The Comet ML API key. Can also be set via COMET_API_KEY environment variable."""
-
-    comet_tags: Optional[list[str]] = None
-    """Optional list of tags to apply to the Comet ML experiment."""
-
-    logging_level: int = logging.INFO
-    """Set default logging level"""
-
-    def finalize(self) -> None:
-        """Validate logger settings and optional MLFlow dependency."""
-        if self.mlflow_experiment and (self.mlflow_run_name is None or self.mlflow_run_name == ""):
-            raise ValueError("Set logger.mlflow_run_name when enabling MLFlow logging.")
-
-        using_mlflow = any(
-            [
-                self.mlflow_experiment,
-                self.mlflow_run_name,
-                self.mlflow_tracking_uri,
-                self.mlflow_tags,
-            ]
-        )
-
-        if using_mlflow:
-            try:
-                import importlib
-
-                importlib.import_module("mlflow")
-            except ModuleNotFoundError as exc:
-                raise ModuleNotFoundError(
-                    "MLFlow logging is configured, but the 'mlflow' package is not installed. "
-                    "Install it via pip install mlflow or uv add mlflow"
-                ) from exc
-
-        if self.comet_project and (self.comet_experiment_name is None or self.comet_experiment_name == ""):
-            raise ValueError("Set logger.comet_experiment_name when enabling Comet ML logging.")
-
-        using_comet = any(
-            [
-                self.comet_project,
-                self.comet_experiment_name,
-                self.comet_workspace,
-                self.comet_api_key,
-                self.comet_tags,
-            ]
-        )
-
-        if using_comet:
-            try:
-                import importlib
-
-                importlib.import_module("comet_ml")
-            except ModuleNotFoundError as exc:
-                raise ModuleNotFoundError(
-                    "Comet ML logging is configured, but the 'comet_ml' package is not installed. "
-                    "Install it via pip install comet-ml or uv add comet-ml"
-                ) from exc
-
-
-@dataclass(kw_only=True)
-class ProfilingConfig(MTrainProfilingConfig):
-    """Configuration settings for profiling the training process."""
-
-    def finalize(self) -> None:
-        """Validate profiling configuration."""
-        assert not (self.use_pytorch_profiler and self.use_nsys_profiler), (
-            "Exactly one of pytorch or nsys profiler should be enabled, not both, when ProfilingConfig is active."
-        )
-        assert self.profile_step_start >= 0, f"profile_step_start must be >= 0, got {self.profile_step_start}"
-        assert self.profile_step_end >= 0, f"profile_step_end must be >= 0, got {self.profile_step_end}"
-        assert self.profile_step_end >= self.profile_step_start, (
-            f"profile_step_end ({self.profile_step_end}) must be >= profile_step_start ({self.profile_step_start})"
-        )
-
-
-@dataclass(kw_only=True)
-class TensorInspectConfig:
-    """Configuration for Nvidia-DL-Framework-Inspect integration."""
-
-    enabled: bool = False
-    """Enable tensor inspection and statistics collection."""
-
-    features: dict[str, Any] | str | Path | None = None
-    """Feature configuration as a Python dict or a YAML file path."""
-
-    feature_dirs: list[str] | None = None
-    """Directories containing feature implementations (searched recursively)."""
-
-    log_dir: str | None = None
-    """Root directory to store inspection logs/statistics. Defaults to checkpoint save dir if unset."""
-
-    init_training_step: int = 0
-    """Initial training step for the inspector (used when resuming)."""
-
-    def finalize(self) -> None:
-        """Populate sensible defaults when inspection is enabled.
-
-        - If feature_dirs is unset, default to the installed TransformerEngine
-          debug features package path (transformer_engine.debug.features), when available.
-        """
-        if not self.enabled:
-            return
-        if not self.feature_dirs:
-            try:
-                import importlib
-
-                te_features_mod = importlib.import_module("transformer_engine.debug.features")
-                te_features_dir = Path(te_features_mod.__file__).parent
-                if te_features_dir.exists():
-                    self.feature_dirs = [str(te_features_dir)]
-            except Exception:
-                pass
-
-
-@dataclass
-class FaultToleranceConfig:
-    """Configuration settings related to fault tolerance mechanisms (NVIDIA internal use)."""
-
-    enable_ft_package: bool = False
-    """If set, Fault Tolerance package is enabled. Note: This feature is for Nvidia internal use only."""
-
-    calc_ft_timeouts: bool = False
-    """If set, FT package will try to automatically compute the timeouts.
-    Note: This feature is for Nvidia internal use only.
-    """
-
-    simulate_fault: bool = False
-    """Sets a simulated fault for fault tolerance. NOTE: This if for fault tolerance testing only."""
-
-    simulated_fault_type: Literal["rank_hung", "rank_killed", "random"] = "random"
-    """How the simulated fault should behave. 'random' will randomly choose one of the other two options."""
-
-    simulated_fault_rank: Optional[int] = None
-    """Rank on which simulated fault should occur."""
-
-    simulated_fault_base_delay: int = 0
-    """Base delay before simulated fault thread is started. A small random delay is added to this."""
-
-
-@dataclass(kw_only=True)
-class StragglerDetectionConfig(MTrainStragglerDetectionConfig):
-    """Configuration settings for detecting and logging GPU stragglers."""
-
-    enable_straggler_on_startup: bool = True
-    """If set, StragglerDetector is enabled on startup."""
-
-
-@dataclass
-class NVRxStragglerDetectionConfig:
-    """Configuration settings for NVIDIA Resiliency Extension straggler detection."""
-
-    enabled: bool = False
-    """Enable NVRx straggler detection."""
-
-    report_time_interval: float = 300.0
-    """Interval [seconds] of the straggler check."""
-
-    calc_relative_gpu_perf: bool = True
-    """Calculate relative GPU performance scores."""
-
-    calc_individual_gpu_perf: bool = True
-    """Calculate individual GPU performance scores."""
-
-    num_gpu_perf_scores_to_print: int = 5
-    """How many best and worst perf scores to print (0 - does not print periodically,
-    but only if stragglers are detected)."""
-
-    gpu_relative_perf_threshold: float = 0.7
-    """Threshold for relative GPU performance scores."""
-
-    gpu_individual_perf_threshold: float = 0.7
-    """Threshold for individual GPU performance scores."""
-
-    stop_if_detected: bool = False
-    """Set to True, to terminate the workload if stragglers are detected."""
-
-    enable_logging: bool = True
-    """Set to True, to log GPU performance scores."""
-
-    profiling_interval: int = 1
-    """Profiling interval passed to straggler.Detector.initialize."""
-
-    logger_name: str = "megatron.bridge.NVRxStragglerDetection"
-    """Logger name for straggler detection messages."""
-
-    def finalize(self) -> None:
-        """Validate NVRx straggler detection configuration."""
-        if self.enabled:
-            if not (self.calc_relative_gpu_perf or self.calc_individual_gpu_perf):
-                raise ValueError(
-                    "At least one of calc_relative_gpu_perf or calc_individual_gpu_perf must be True "
-                    "when NVRx straggler detection is enabled."
-                )
-            if self.report_time_interval <= 0:
-                raise ValueError("report_time_interval must be positive.")
-            if not (0.0 <= self.gpu_relative_perf_threshold <= 1.0):
-                raise ValueError("gpu_relative_perf_threshold must be between 0.0 and 1.0.")
-            if not (0.0 <= self.gpu_individual_perf_threshold <= 1.0):
-                raise ValueError("gpu_individual_perf_threshold must be between 0.0 and 1.0.")
-
-
-@dataclass
-class InProcessRestartConfig:
-    """Configuration settings for NVIDIA Resiliency Extension in-process restart functionality."""
-
-    enabled: bool = False
-    """Enable in-process restart mechanism from nvidia-resiliency-ext."""
-
-    max_iterations: Optional[int] = None
-    """Maximum number of in-process restart iterations."""
-
-    monitor_thread_interval: float = 1.0
-    """Monitoring interval (in seconds) for the monitoring thread."""
-
-    monitor_process_interval: float = 1.0
-    """Monitoring interval (in seconds) for the monitoring process."""
-
-    progress_watchdog_interval: float = 1.0
-    """Interval (in seconds) for automatic progress watchdog timestamp updates."""
-
-    heartbeat_interval: float = 30.0
-    """Monitoring interval (in seconds) for detecting unresponsive ranks."""
-
-    soft_timeout: float = 60.0
-    """Soft progress timeout (in seconds)."""
-
-    hard_timeout: float = 90.0
-    """Hard progress timeout (in seconds)."""
-
-    heartbeat_timeout: float = 60.0
-    """Timeout (in seconds) for a missing rank detection heartbeat."""
-
-    barrier_timeout: float = 120.0
-    """Timeout (in seconds) for internal distributed barrier."""
-
-    completion_timeout: float = 120.0
-    """Timeout (in seconds) for barrier on completion on all ranks."""
-
-    last_call_wait: float = 1.0
-    """Time interval (in seconds) for other ranks to report concurrent terminal failures."""
-
-    termination_grace_time: float = 1.0
-    """Interval (in seconds) between SIGTERM and SIGKILL issued on hard timeout."""
-
-    granularity: Literal["node", "rank"] = "node"
-    """Granularity for in-process restart."""
-
-    active_world_size: Optional[int] = None
-    """The number of ranks initially executing the workload.
-    The remaining ranks from the allocation are set aside as warm reserve.
-    If None, defaults to WORLD_SIZE environment variable."""
-
-    empty_cuda_cache: bool = True
-    """Empty CUDA cache during restart finalization."""
-
-    max_rank_faults: Optional[int] = None
-    """Maximum number of rank faults allowed before terminating the job."""
-
-    monitor_process_logdir: Optional[str] = None
-    """Directory for monitor process log files. If None, monitor process logging is disabled."""
-
-
-# ---------------- Container config (standalone top-level config) ----------------
-@dataclass(kw_only=True)
-class ConfigContainer(Container):
-    """Top-level container holding all configuration objects."""
-
-    rng: RNGConfig = field(default_factory=RNGConfig)
-    rerun_state_machine: RerunStateMachineConfig = field(default_factory=RerunStateMachineConfig)
-    train: TrainingConfig
-    model: (
-        GPTModelProvider | T5ModelProvider | MambaModelProvider | MimoModelProvider | GPTModelConfig | MambaModelConfig
-    )
-    optimizer: OptimizerConfig
-    optimizer_config_override_provider: OptimizerConfigOverrideProvider = field(
-        default_factory=OptimizerConfigOverrideProvider
-    )
-    ddp: DistributedDataParallelConfig = field(default_factory=DistributedDataParallelConfig)
-    validation: ValidationConfig = field(default_factory=ValidationConfig)
-    scheduler: SchedulerConfig
-    dataset: GPTDatasetConfig | FinetuningDatasetConfig | DatasetProvider
-    logger: LoggerConfig
-    tokenizer: TokenizerConfig
-    checkpoint: CheckpointConfig
-    dist: DistributedInitConfig = field(default_factory=DistributedInitConfig)
-    ft: Optional[FaultToleranceConfig] = None
-    straggler: Optional[StragglerDetectionConfig] = None
-    nvrx_straggler: Optional[NVRxStragglerDetectionConfig] = None
-    profiling: ProfilingConfig = field(default_factory=ProfilingConfig)
-    peft: Optional[PEFT] = None
-    comm_overlap: Optional[CommOverlapConfig] = None
-    mixed_precision: Optional[Union[MixedPrecisionConfig, str]] = None
-    tensor_inspect: TensorInspectConfig | None = None
-    inprocess_restart: Optional[InProcessRestartConfig] = None
-
-    def get_data_parallel_size(self, world_size: int) -> int:
-        """Calculate the data parallel size based on the model configuration."""
-        model_cfg = self.model
-        total_model_size = (
-            model_cfg.tensor_model_parallel_size
-            * model_cfg.pipeline_model_parallel_size
-            * model_cfg.context_parallel_size
-        )
-        assert world_size % total_model_size == 0, f"""
-        world size ({world_size}) is not divisible by total_model_size ({model_cfg.tensor_model_parallel_size=} * {model_cfg.pipeline_model_parallel_size=} * {model_cfg.context_parallel_size=})
-        """
-        return world_size // total_model_size
-
-    def set_data_parallel_size(self) -> None:
-        """Calculate and set data_parallel_size for this config and comm_overlap config.
-
-        This method calculates the data parallel size needed by setup methods, without
-        triggering full validation or finalization of Megatron Core configs.
-        """
-        # Calculate data parallel size (needed for comm overlap setup)
-        world_size = get_world_size_safe()
-        self.data_parallel_size = self.get_data_parallel_size(world_size)
-
-        # Set data_parallel_size on comm_overlap config if present
-        if self.comm_overlap is not None:
-            self.comm_overlap.data_parallel_size = self.data_parallel_size
-
-    def _validate_and_apply_deterministic_mode(self) -> None:
-        """Apply and validate deterministic mode requirements.
-
-        This enforces restrictions and settings that must hold when
-        the model is configured to run in deterministic mode.
-        """
-        if not getattr(self.model, "deterministic_mode", False):
-            return
-
-        # Disallow flash attention when running deterministically
-        if getattr(self.model, "attention_backend", None) == AttnBackend.flash:
-            raise AssertionError("Flash attention can not be used in deterministic mode.")
-
-        # Disallow cross-entropy loss fusion as it is not deterministic
-        assert not getattr(self.model, "cross_entropy_loss_fusion", False), (
-            "Cross Entropy Fusion is currently not deterministic."
-        )
-
-        all_reduce_choices = ("Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS")
-        assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, (
-            f"NCCL_ALGO must be one of {all_reduce_choices}."
-        )
-
-        # Enable deterministic algorithms in torch
-        torch.use_deterministic_algorithms(True)
-
-    def validate(self) -> None:
-        """Performs validation checks on the combined configuration.
-
-        Calculates dependent values like data_parallel_size and scheduler steps.
-        Ensures compatibility between different configuration settings.
-        """
-
-        # Propagate in-batch packing flag to model config so TransformerConfig.finalize()
-        # can enable variable_seq_lengths for pipeline parallelism.
-        if getattr(self.dataset, "pack_sequences_in_batch", False):
-            self.model._pack_sequences_in_batch = True
-
-        if hasattr(self.dataset, "finalize"):
-            self.dataset.finalize()
-        if hasattr(self.ddp, "finalize"):
-            self.ddp.finalize()
-        if hasattr(self.optimizer, "finalize"):
-            self.optimizer.finalize()
-        if hasattr(self.model, "finalize"):
-            self.model.finalize()
-
-        self.logger.finalize()
-        self.train.finalize()
-        self.scheduler.finalize()
-        self.checkpoint.finalize()
-        if self.profiling is not None:
-            self.profiling.finalize()
-        if self.nvrx_straggler is not None:
-            self.nvrx_straggler.finalize()
-        if self.tensor_inspect is not None:
-            self.tensor_inspect.finalize()
-
-        # Sync config. If TE RNG tracker is set in either ways, set them in both places.
-        if self.rng.te_rng_tracker or self.model.use_te_rng_tracker:
-            self.model.use_te_rng_tracker = self.rng.te_rng_tracker = True
-
-        # Re-run post-inits of sub-configs
-        for f in fields(self):
-            sub_cfg = getattr(self, f.name)
-            if hasattr(sub_cfg, "__post_init__") and not hasattr(sub_cfg, "finalize"):
-                sub_cfg.__post_init__()
-
-        # Distributed - ensure data_parallel_size is calculated (might already be set by set_data_parallel_size)
-        if not hasattr(self, "data_parallel_size") or self.data_parallel_size is None:
-            world_size = get_world_size_safe()
-            self.data_parallel_size = self.get_data_parallel_size(world_size)
-            # Set data_parallel_size on comm_overlap config if present
-            if self.comm_overlap is not None:
-                self.comm_overlap.data_parallel_size = self.data_parallel_size
-
-        # Deterministic mode validations and settings
-        self._validate_and_apply_deterministic_mode()
-
-        # Run validations
-        _validate_and_sync_distributed_optimizer_settings(self)
-        _validate_mixed_precision_consistency(self)
-        _validate_fine_grained_activation_offloading(self)
-
-        # CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph
-        if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope:
-            assert not self.rerun_state_machine.check_for_nan_in_loss, (
-                "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. "
-                "Set rerun_state_machine.check_for_nan_in_loss=False."
-            )
-        if self.model.cuda_graph_impl == "none":
-            self.model.cuda_graph_scope = []
-
-        if self.dist.use_megatron_fsdp and self.dist.use_torch_fsdp2:
-            raise ValueError("Using use_megatron_fsdp and use_torch_fsdp2 at the same time is not supported.")
-
-        # Megatron FSDP Config checks
-        if self.dist.use_megatron_fsdp or self.ddp.use_megatron_fsdp:
-            # Set Megatron FSDP Configs
-            self.dist.use_megatron_fsdp = True
-            self.ddp.use_megatron_fsdp = True
-
-            assert not self.dist.use_tp_pp_dp_mapping, "use_tp_pp_dp_mapping is not supported with Megatron FSDP"
-
-            if self.checkpoint.save is not None or self.checkpoint.load is not None:
-                # only check if saving or loading
-                assert self.checkpoint.ckpt_format == "fsdp_dtensor", (
-                    "Megatron FSDP only supports fsdp_dtensor checkpoint format"
-                )
-
-            if self.ddp.average_in_collective and not self.ddp.disable_symmetric_registration:
-                print_rank_0(
-                    "average_in_collective is not supported with NCCL symmetric registration, setting to False"
-                )
-                self.ddp.average_in_collective = False
-
-            # reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP
-            if self.ddp.reuse_grad_buf_for_mxfp8_param_ag:
-                print_rank_0("reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP, setting to False")
-                self.ddp.reuse_grad_buf_for_mxfp8_param_ag = False
-            if self.optimizer.reuse_grad_buf_for_mxfp8_param_ag:
-                self.optimizer.reuse_grad_buf_for_mxfp8_param_ag = False
-
-        # ModelOpt/Quantization checks
-        if getattr(self.model, "restore_modelopt_state", False):
-            assert not self.model.gradient_accumulation_fusion, (
-                "Gradient accumulation fusion is not supported with ModelOpt/Quantized models. "
-                "Please set model.gradient_accumulation_fusion=False"
-            )
-
-        # Checkpoint
-        if self.checkpoint.save is not None or self.checkpoint.load is not None:
-            # only check if saving or loading
-            if self.checkpoint.ckpt_format == "fsdp_dtensor":
-                assert self.ddp.use_megatron_fsdp and not self.dist.use_torch_fsdp2, (
-                    "fsdp_dtensor checkpoint format only supports Megatron FSDP"
-                )
-
-        # Enforce async_save format restriction
-        if self.checkpoint.async_save:
-            assert self.checkpoint.ckpt_format == "torch_dist", (
-                "async_save is only supported with ckpt_format='torch_dist'"
-            )
-
-        # Set defaults for tensor inspect callback
-        if self.tensor_inspect is not None and self.tensor_inspect.enabled:
-            if self.tensor_inspect.log_dir is None:
-                self.tensor_inspect.log_dir = self.checkpoint.save or "."
-            if self.tensor_inspect.init_training_step == 0 and self.checkpoint.ckpt_step is not None:
-                self.tensor_inspect.init_training_step = int(self.checkpoint.ckpt_step)
-
-        self.model.use_cpu_initialization = self.model.use_cpu_initialization or self.dist.lazy_mpu_init
-
-        # Gloo process groups are not supported when using decentralized process groups (NCCL only).
-        if self.dist.use_decentralized_pg:
-            assert not self.dist.use_gloo_process_groups, (
-                "Gloo process groups are not supported when use_decentralized_pg=True. "
-                "Decentralized process groups only support NCCL backend."
-            )
-
-        # Make sure all functionality that requires Gloo process groups is disabled.
-        if not self.dist.use_gloo_process_groups:
-            if self.optimizer.use_distributed_optimizer:
-                # If using distributed optimizer, must use distributed checkpointing.
-                # Legacy checkpointing uses Gloo process groups to collect full distributed
-                # optimizer state in the CPU memory of DP rank 0.
-                assert self.checkpoint.ckpt_format == "torch_dist"
-
-        # Cross-validation between training and scheduler configs
-        self._validate_training_scheduler_compatibility()
-
-        # Calculate scheduler steps for both iteration-based and sample-based training
-        self._calculate_scheduler_steps()
-
-        if self.model.context_parallel_size > 1:
-            assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, (
-                "Sequence length must be divisible by 2 * context parallel size if context parallel is used."
-            )
-            if isinstance(self.dataset, FinetuningDatasetConfig):
-                # check calculate_per_token_loss to be True
-                # check average_in_collective to be False
-                # for context parallel to solve the issue of nan loss on ranks with all tokens masked
-                # (only happens in SFT)
-                assert self.model.calculate_per_token_loss, (
-                    "When finetuning with CP>1, calculate_per_token_loss must be True"
-                )
-                assert not self.ddp.average_in_collective, (
-                    "When finetuning with CP>1, average_in_collective must be False"
-                )
-
-        self._validate_cp_comm_type()
-
-        if (
-            isinstance(self.dataset, FinetuningDatasetConfig)
-            and self.dataset.packed_sequence_specs is not None
-            and self.dataset.packed_sequence_specs.packed_sequence_size > 0
-            and self.train.micro_batch_size > 1
-        ):
-            packed_sequence_size = self.dataset.packed_sequence_specs.packed_sequence_size
-            raise ValueError(
-                "Micro batch size should be 1 when training with packed sequence, but your micro batch size "
-                f"is {self.train.micro_batch_size}. \nThe following config is equivalent to your current setting for "
-                f"a packed dataset. Please update your config to the following: \n"
-                f"Set micro batch size to 1 (currently {self.train.micro_batch_size})\n"
-                f"Set global batch size to {self.train.global_batch_size // self.train.micro_batch_size} "
-                f"(currently {self.train.global_batch_size}) \n"
-                f"Set packed sequence length to {packed_sequence_size * self.train.micro_batch_size} "
-                f"(currently {packed_sequence_size}) \n"
-                f"For details please visit "
-                f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html"
-            )
-
-        if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1:
-            raise ValueError(
-                "micro_batch_size should be greater than 1 when using pack_sequences_in_batch=True. "
-                "In-batch packing concatenates multiple sequences within a microbatch, so at least 2 sequences "
-                "are required per micro-batch."
-            )
-
-        if self.peft is not None:
-            assert self.checkpoint.pretrained_checkpoint is not None, "PEFT requires a pretrained checkpoint path"
-
-        if self.dataset is not None:
-            # Only validate sequence length for GPTDatasetConfig or FinetuningDatasetConfig
-            # DatasetProvider instances may not have sequence_length attributes
-            if isinstance(self.dataset, (GPTDatasetConfig, FinetuningDatasetConfig)):
-                data_seq_length = (
-                    self.dataset.seq_length
-                    if isinstance(self.dataset, FinetuningDatasetConfig)
-                    else self.dataset.seq_length
-                )
-
-                assert self.model.seq_length == data_seq_length, (
-                    f"Please ensure sequence length configuration in model config and "
-                    f"dataset config match.\nSequence length in model config: {self.model.seq_length}, "
-                    f"Sequence length in dataset config: {data_seq_length}"
-                )
-
-        # Validate DeepEP or HybridEP is supported for the current GPU architecture
-        if isinstance(self.model, (GPTModelConfig, MambaModelConfig)):
-            validate_flex_dispatcher_backend(self.model.transformer)
-        else:
-            validate_flex_dispatcher_backend(self.model)
-
-        for f in fields(ValidationConfig):
-            train_val = getattr(self.train, f.name, None)
-            if train_val is not None:
-                warnings.warn(
-                    f"TrainingConfig.{f.name} is deprecated and will be removed in a future release. Use ValidationConfig.{f.name} instead.",
-                    stacklevel=2,
-                )
-                setattr(self.validation, f.name, train_val)
-
-    def _validate_cp_comm_type(self) -> None:
-        """Validate cp_comm_type and hierarchical_context_parallel_sizes consistency."""
-        cp_comm_type = getattr(self.model, "cp_comm_type", None)
-        hcp_sizes = getattr(self.model, "hierarchical_context_parallel_sizes", None)
-        cp_size = getattr(self.model, "context_parallel_size", 1)
-
-        if cp_size > 1 and cp_comm_type is not None:
-            if isinstance(cp_comm_type, list):
-                assert len(cp_comm_type) == self.model.num_layers, (
-                    f"Length of cp_comm_type ({len(cp_comm_type)}) must equal num_layers ({self.model.num_layers})."
-                )
-            else:
-                assert isinstance(cp_comm_type, str), (
-                    f"cp_comm_type must be a str or list of str, got {type(cp_comm_type)}."
-                )
-
-        cp_comm_types = cp_comm_type if isinstance(cp_comm_type, list) else [cp_comm_type or "p2p"]
-        if any("a2a+p2p" in ct for ct in cp_comm_types):
-            assert hcp_sizes is not None, (
-                "hierarchical_context_parallel_sizes must be set when cp_comm_type "
-                "contains 'a2a+p2p'. Without it, CP communication is silently disabled "
-                "and each rank attends only to its local chunk, producing artificially "
-                "high throughput but broken training. Example: for cp=16 across 4 nodes "
-                "of 8 GPUs, set hierarchical_context_parallel_sizes=[8, 2]."
-            )
-
-        if hcp_sizes is not None:
-            from math import prod
-
-            assert prod(hcp_sizes) == cp_size, (
-                f"Product of hierarchical_context_parallel_sizes {hcp_sizes} "
-                f"(={prod(hcp_sizes)}) must equal context_parallel_size (={cp_size})."
-            )
-
-    def _validate_training_scheduler_compatibility(self) -> None:
-        """Cross-validation between training and scheduler configs."""
-        has_train_samples = self.train.train_samples is not None
-
-        if has_train_samples:
-            # Sample-based training validation
-            assert self.scheduler.lr_decay_iters is None, (
-                "Use lr_decay_samples for sample-based training, not lr_decay_iters"
-            )
-            assert self.scheduler.lr_warmup_iters == 0, (
-                "Use lr_warmup_samples for sample-based training, not lr_warmup_iters"
-            )
-            assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_samples != 0), (
-                "Can only specify one of lr_warmup_fraction or lr_warmup_samples"
-            )
-        else:
-            # Iteration-based training validation
-            assert self.scheduler.lr_decay_samples is None, (
-                "Use lr_decay_iters for iteration-based training, not lr_decay_samples"
-            )
-            assert self.scheduler.lr_warmup_samples == 0, (
-                "Use lr_warmup_iters for iteration-based training, not lr_warmup_samples"
-            )
-            assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_iters != 0), (
-                "Can only specify one of lr_warmup_fraction or lr_warmup_iters"
-            )
-
-    def _calculate_scheduler_steps(self) -> None:
-        """Calculate scheduler steps for both iteration-based and sample-based training."""
-        is_sample_based = self.train.train_samples is not None
-
-        if is_sample_based:
-            if self.scheduler.lr_decay_samples is None:
-                self.scheduler.lr_decay_samples = self.train.train_samples
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples
-            self.scheduler.wd_incr_steps = self.train.train_samples
-
-            if self.scheduler.lr_wsd_decay_samples is not None:
-                self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_samples
-
-            # Warmup calculation for sample-based training
-            if self.scheduler.lr_warmup_fraction is not None:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps
-            else:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_samples
-        else:
-            # Iteration-based training
-            if self.scheduler.lr_decay_iters is None:
-                self.scheduler.lr_decay_iters = self.train.train_iters
-            if self.scheduler.lr_wsd_decay_iters is None and self.scheduler.lr_decay_style == "WSD":
-                self.scheduler.lr_wsd_decay_iters = self.scheduler.lr_decay_iters
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_iters * self.train.global_batch_size
-            self.scheduler.wd_incr_steps = self.train.train_iters * self.train.global_batch_size
-
-            if self.scheduler.lr_wsd_decay_iters is not None:
-                self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_iters * self.train.global_batch_size
-
-            if self.scheduler.lr_warmup_fraction is not None:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps
-            else:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_iters * self.train.global_batch_size
-
-        # Enforce the Megatron Core invariant: lr_warmup_steps must be < lr_decay_steps.
-        # This can be violated when train_iters is small (e.g. smoke runs) while
-        # lr_warmup_iters is tuned for a full-length training run.
-        if self.scheduler.lr_decay_steps <= 0:
-            raise ValueError(
-                f"lr_decay_steps must be > 0, got {self.scheduler.lr_decay_steps}. "
-                "Please increase train_iters/train_samples or lr_decay_iters/lr_decay_samples."
-            )
-        if self.scheduler.lr_warmup_steps >= self.scheduler.lr_decay_steps:
-            capped = self.scheduler.lr_decay_steps - 1
-            warnings.warn(
-                f"lr_warmup_steps ({self.scheduler.lr_warmup_steps}) >= lr_decay_steps "
-                f"({self.scheduler.lr_decay_steps}); capping lr_warmup_steps to {capped}. "
-                "Reduce lr_warmup_iters (or lr_warmup_samples) for short training runs.",
-                UserWarning,
-                stacklevel=2,
-            )
-            self.scheduler.lr_warmup_steps = capped
-
-    def log_non_default_values(self) -> None:
-        """Log configuration values that differ from Megatron Core defaults.
-
-        For configs that inherit from Megatron Core (e.g., OptimizerConfig, DDPConfig,
-        TransformerConfig), this method logs only the values that differ from the Mcore
-        defaults. This makes it easier to spot unintended deviations from baseline settings.
-
-        For configs that don't inherit from Mcore, key values are logged via
-        `_get_key_config_values`, which excludes None values and callables.
-        """
-        if isinstance(self.model, (GPTModelConfig, MambaModelConfig)):
-            transformer_cfg = self.model.transformer
-        else:
-            transformer_cfg = self.model
-        # Determine the correct Mcore parent class for the model config
-        # Some models (e.g., DeepSeek) use MLATransformerConfig instead of TransformerConfig
-        model_mcore_class = _get_mcore_transformer_parent(transformer_cfg)
-
-        # Map of config names to their (config object, Mcore parent class or None)
-        mcore_configs = [
-            ("optimizer", self.optimizer, MCoreOptimizerConfig),
-            ("ddp", self.ddp, MCoreDistributedDataParallelConfig),
-            ("model", transformer_cfg, model_mcore_class),
-        ]
-
-        # Non-Mcore configs - log all values
-        non_mcore_configs = [
-            ("train", self.train),
-            ("validation", self.validation),
-            ("scheduler", self.scheduler),
-            ("dataset", self.dataset),
-            ("checkpoint", self.checkpoint),
-            ("logger", self.logger),
-            ("tokenizer", self.tokenizer),
-            ("rng", self.rng),
-        ]
-
-        log_lines = [""]
-        log_lines.append("=" * 70)
-        log_lines.append("Configuration Summary (Non-Default Values vs Megatron Core)")
-        log_lines.append("=" * 70)
-
-        # Log non-default values for Mcore configs
-        for config_name, config_obj, mcore_class in mcore_configs:
-            non_defaults = _get_non_default_values(config_obj, mcore_class)
-            if non_defaults:
-                log_lines.append(f"\n[{config_name}] Non-default values (vs Mcore {mcore_class.__name__}):")
-                for field_name, (current_val, default_val) in sorted(non_defaults.items()):
-                    log_lines.append(f"  {field_name}: {current_val!r}  (Mcore default: {default_val!r})")
-
-        # Log key values for non-Mcore configs
-        log_lines.append("\n" + "-" * 70)
-        log_lines.append("Other Configuration Values:")
-        log_lines.append("-" * 70)
-
-        for config_name, config_obj in non_mcore_configs:
-            if config_obj is None:
-                continue
-            key_values = _get_key_config_values(config_obj)
-            if key_values:
-                log_lines.append(f"\n[{config_name}]:")
-                for field_name, value in sorted(key_values.items()):
-                    log_lines.append(f"  {field_name}: {value!r}")
-
-        log_lines.append("\n" + "=" * 70)
-
-        print_rank_0("\n".join(log_lines))
-
-
-def _get_mcore_transformer_parent(model_config: Any) -> type:
-    """Determine the correct Mcore TransformerConfig parent class for a model.
-
-    Some models (e.g., DeepSeek v2/v3) inherit from MLATransformerConfig instead of
-    the base TransformerConfig. This function checks the inheritance chain to find
-    the appropriate Mcore class to use as the baseline for comparison.
-
-    Args:
-        model_config: The model configuration object.
-
-    Returns:
-        The appropriate Mcore TransformerConfig class (MCoreMLATransformerConfig or
-        MCoreTransformerConfig).
-    """
-    # Check if the model inherits from MLATransformerConfig
-    if isinstance(model_config, MCoreMLATransformerConfig):
-        return MCoreMLATransformerConfig
-    return MCoreTransformerConfig
-
-
-def _get_non_default_values(config_obj: Any, mcore_class: type) -> Dict[str, Tuple[Any, Any]]:
-    """Get values that differ from Mcore parent class defaults.
-
-    Args:
-        config_obj: The config object to compare.
-        mcore_class: The Megatron Core parent class to compare against.
-
-    Returns:
-        Dictionary mapping field name to (current_value, default_value) for non-default fields.
-    """
-    non_defaults = {}
-
-    # Get default values from Mcore class
-    mcore_defaults = {}
-    for f in fields(mcore_class):
-        if f.name.startswith("_"):
-            continue
-        if f.default is not MISSING:
-            mcore_defaults[f.name] = f.default
-        elif f.default_factory is not MISSING:
-            mcore_defaults[f.name] = f.default_factory()
-
-    # Compare current values against Mcore defaults
-    for f in fields(config_obj):
-        if f.name.startswith("_"):
-            continue
-        field_name = f.name
-        current_value = getattr(config_obj, field_name, None)
-
-        if field_name in mcore_defaults:
-            default_value = mcore_defaults[field_name]
-            # Skip callable values (like functions) and complex objects
-            if callable(current_value) or callable(default_value):
-                continue
-            # Compare values
-            try:
-                if current_value != default_value:
-                    non_defaults[field_name] = (current_value, default_value)
-            except (TypeError, ValueError):
-                # Some types may not be directly comparable (e.g., torch.dtype)
-                if str(current_value) != str(default_value):
-                    non_defaults[field_name] = (current_value, default_value)
-
-    return non_defaults
-
-
-def _get_key_config_values(config_obj: Any) -> Dict[str, Any]:
-    """Get key configuration values for non-Mcore configs.
-
-    Args:
-        config_obj: The config object to extract values from.
-
-    Returns:
-        Dictionary mapping field name to value for key fields.
-    """
-    values = {}
-    if not hasattr(config_obj, "__dataclass_fields__"):
-        return values
-
-    for f in fields(config_obj):
-        if f.name.startswith("_"):
-            continue
-        value = getattr(config_obj, f.name, None)
-        # Skip None values and complex objects
-        if value is None:
-            continue
-        if callable(value):
-            continue
-        values[f.name] = value
-
-    return values
-
-
-def runtime_config_update(cfg: ConfigContainer) -> None:
-    """Apply runtime configuration updates prior to initialization.
-
-    This function handles all configuration modifications that need to happen
-    after initial config creation but before final validation and model setup.
-
-    Steps:
-    1. Resolve mixed precision configuration from string if needed
-    2. Apply mixed precision settings to model, optimizer, and DDP configs
-    3. Calculate data parallel size (needed for comm overlap)
-    4. Apply communication overlap configuration
-    5. Validate configuration after all modifications
-
-    Args:
-        cfg: Configuration container to update
-    """
-    # Apply mixed precision configuration if provided
-    if cfg.mixed_precision is not None:
-        if isinstance(cfg.mixed_precision, str):
-            cfg.mixed_precision = get_mixed_precision_config(cfg.mixed_precision)
-        cfg.mixed_precision.finalize()
-        cfg.mixed_precision.setup(cfg.model, cfg.optimizer, cfg.ddp)
-
-    # Calculate data parallel size (needed for comm overlap methods)
-    cfg.set_data_parallel_size()
-
-    # Apply communication overlap configuration if provided
-    if cfg.comm_overlap is not None:
-        cfg.comm_overlap.finalize()
-        cfg.comm_overlap.setup(cfg.model, cfg.optimizer, cfg.ddp)
-
-    # Validate configuration after all modifications
-    cfg.validate()
-
-
-def mimo_runtime_config_update(cfg: ConfigContainer) -> None:
-    """MIMO-equivalent of ``runtime_config_update``.
-
-    The standard ``runtime_config_update`` cannot be used directly because it
-    accesses ``cfg.model`` attributes (``bf16``, ``tensor_model_parallel_size``,
-    ``cuda_graph_impl``, …) that do not exist on ``MimoModelProvider``.
-
-    This function cherry-picks the safe, model-agnostic parts:
-
-    Keeps (safe for MIMO):
-    - ``data_parallel_size = 1`` (MIMO-specific hard-code)
-    - Sub-config finalization (optimizer, ddp, logger, train, scheduler, checkpoint)
-    - Distributed optimizer sync validation
-    - Deterministic mode validation
-
-    Skips (would crash or is N/A):
-    - Mixed precision resolution (per-module, not container-level)
-    - Communication overlap setup (not supported for MIMO)
-    - Model-level validations (FSDP, CUDA graphs, TE RNG tracker sync, etc.)
-
-    See ``playground/runtime_config_update_analysis.md`` for the full analysis.
-    """
-    # MIMO: data_parallel_size is always 1 from the training loop's perspective.
-    cfg.data_parallel_size = 1
-
-    # Finalize sub-configs that don't depend on model construction order.
-    # NOTE: cfg.model.finalize() is NOT called here — it validates parallelism
-    # config and is called inside setup_mimo() right before build_infra().
-    if hasattr(cfg.optimizer, "finalize"):
-        cfg.optimizer.finalize()
-    if hasattr(cfg.ddp, "finalize"):
-        cfg.ddp.finalize()
-    cfg.logger.finalize()
-    cfg.train.finalize()
-    cfg.scheduler.finalize()
-    cfg.checkpoint.finalize()
-
-    # Safe validations
-    _validate_and_sync_distributed_optimizer_settings(cfg)
-    cfg._validate_and_apply_deterministic_mode()
-
-
-def _validate_and_sync_distributed_optimizer_settings(config: ConfigContainer) -> None:
-    """Validate and synchronize distributed optimizer settings between DDP and optimizer configs.
-
-    This function ensures that distributed optimizer settings are consistent across
-    DDP and optimizer configurations. If either setting is enabled, both will be
-    enabled to maintain consistency.
-
-    Args:
-        config: The configuration container to validate and potentially modify.
-    """
-    ddp_setting = config.ddp.use_distributed_optimizer
-    optimizer_setting = config.optimizer.use_distributed_optimizer
-
-    if ddp_setting or optimizer_setting:
-        if ddp_setting != optimizer_setting:
-            warn_rank_0(
-                f"Distributed optimizer settings were not in sync: "
-                f"ddp.use_distributed_optimizer={ddp_setting}, "
-                f"optimizer.use_distributed_optimizer={optimizer_setting}. "
-                f"Automatically enabling distributed optimizer for both settings."
-            )
-        config.ddp.use_distributed_optimizer = True
-        config.optimizer.use_distributed_optimizer = True
-
-
-def _validate_mixed_precision_consistency(config: ConfigContainer) -> None:
-    """Validate that mixed precision settings are consistent between model and optimizer configs.
-
-    Args:
-        config: The configuration container to validate.
-
-    Raises:
-        AssertionError: If precision settings are inconsistent in a way that would
-            indicate ambiguous behavior.
-    """
-    model_cfg = config.model
-    optimizer_cfg = config.optimizer
-
-    # Mutually exclusive: cannot have both bf16 and fp16 enabled
-    assert not (model_cfg.bf16 and model_cfg.fp16), (
-        "Model config cannot have both bf16=True and fp16=True. Please set only one precision mode."
-    )
-    assert not (optimizer_cfg.bf16 and optimizer_cfg.fp16), (
-        "Optimizer config cannot have both bf16=True and fp16=True. Please set only one precision mode."
-    )
-
-    # Validate across model and optimizer configs
-    if optimizer_cfg.use_precision_aware_optimizer:
-        # For bf16 training: optimizer.bf16 must match model.bf16
-        if model_cfg.bf16:
-            assert optimizer_cfg.bf16, (
-                "optimizer.bf16=True must be set when model.bf16=True and use_precision_aware_optimizer=True."
-            )
-        # For fp16 training: optimizer.fp16 must match model.fp16
-        if model_cfg.fp16:
-            assert optimizer_cfg.fp16, (
-                "optimizer.fp16=True must be set when model.fp16=True and use_precision_aware_optimizer=True."
-            )
-        # For fp32 training (neither bf16 nor fp16 on model)
-        if not model_cfg.bf16 and not model_cfg.fp16:
-            assert not optimizer_cfg.bf16 and not optimizer_cfg.fp16, (
-                "optimizer.bf16 and optimizer.fp16 must both be False when "
-                "model is using fp32 precision (model.bf16=False, model.fp16=False) and "
-                "use_precision_aware_optimizer=True."
-            )
-
-
-def _validate_fine_grained_activation_offloading(config: ConfigContainer) -> None:
-    """Validate fine-grained activation offloading configuration.
-
-    This function ensures that fine-grained activation offloading is only enabled
-    with compatible configurations (transformer_engine implementation) and that
-    necessary environment variables are set for newer TE versions.
-
-    Args:
-        config: The configuration container to validate.
-
-    Raises:
-        ValueError: If fine-grained activation offloading is enabled with incompatible settings.
-    """
-    from megatron.core.utils import is_te_min_version
-
-    model_cfg = config.model
-
-    if not model_cfg.fine_grained_activation_offloading:
-        return
-
-    # Fine-grained activation offloading requires transformer_engine implementation
-    if model_cfg.transformer_impl != "transformer_engine":
-        raise ValueError(
-            "Fine-grained activation offloading is only supported with transformer_engine implementation. "
-            f"Current transformer_impl: {model_cfg.transformer_impl}"
-        )
-
-    # For TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 must be set to avoid offloading weights
-    if is_te_min_version("2.10.0"):
-        if os.getenv("NVTE_CPU_OFFLOAD_V1", "0") != "1":
-            raise ValueError(
-                "For fine-grained activation offloading with TE >= 2.10.0, "
-                "NVTE_CPU_OFFLOAD_V1 environment variable should be set to 1 to avoid offloading weights."
-            )
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/comm_overlap.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from dataclasses import asdict, dataclass, fields
-from typing import Optional
-
-from megatron.core.distributed import DistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig
-from megatron.core.transformer.enums import CudaGraphScope
-from megatron.core.utils import get_te_version, is_te_min_version, is_torch_min_version
-
-from megatron.bridge.models import GPTModelProvider, T5ModelProvider
-from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig
-from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig
-
-
-try:
-    import transformer_engine  # type: ignore  # noqa: F401
-
-    HAVE_TE = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_TE = False
-
-
-@dataclass
-class TPOverlapCfg:
-    """Dataclass for linear layer TP overlap config."""
-
-    pass
-
-
-@dataclass
-class PipelineOverlapCfg(TPOverlapCfg):
-    """Dataclass for pipeline TP overlap config."""
-
-    num_sm: int
-    cga_size: int
-    num_splits: int
-    set_sm_margin: bool
-    fp8_buf: bool = (False,)
-    atomic_gemm: bool = False
-    method: str = "pipeline"
-
-
-@dataclass
-class RingExchangeOverlapCfg(TPOverlapCfg):
-    """Dataclass for ring exchange TP overlap config."""
-
-    aggregate: bool = False
-    method: str = "ring_exchange"
-    num_sm: int = 1
-    cga_size: int = 1
-    set_sm_margin: bool = False
-    fp8_buf: bool = False
-    atomic_gemm: bool = False
-
-
-@dataclass
-class BulkOverlapCfg(TPOverlapCfg):
-    """Dataclass for bulk TP overlap config."""
-
-    num_sm: int
-    cga_size: int
-    set_sm_margin: bool
-    method: str = "bulk"
-
-
-@dataclass
-class TransformerLayerTPOverlapCfg:
-    """Dataclass for transformer layer TP overlap config."""
-
-    qkv_dgrad: TPOverlapCfg
-    qkv_wgrad: TPOverlapCfg
-    fc1_dgrad: TPOverlapCfg
-    fc1_wgrad: TPOverlapCfg
-    qkv_fprop: TPOverlapCfg
-    proj_dgrad: TPOverlapCfg
-    fc1_fprop: TPOverlapCfg
-    fc2_dgrad: TPOverlapCfg
-    proj_fprop: TPOverlapCfg
-    fc2_fprop: TPOverlapCfg
-
-
-# TODO: Add more configs and create a getter function for expose a single api
-# Model configs: H100/70B/TP8/MBS1/SeqLen8K
-userbuffers_bf16_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_h100_h8192_tp4_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-userbuffers_bf16_b200_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_b200_h8192_tp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-# llama3.1 405b
-userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=8, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_h100_h16384_tp8_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-userbuffers_bf16_b200_h16384_tp4_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=24, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=8, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_b200_h16384_tp4_cp2_mbs1_seqlen8192 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-# llama3 70b LoRA
-userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=None,
-    fc1_dgrad=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-    fc1_wgrad=None,
-    qkv_fprop=RingExchangeOverlapCfg(set_sm_margin=True),
-    proj_dgrad=RingExchangeOverlapCfg(set_sm_margin=True),
-    fc1_fprop=RingExchangeOverlapCfg(set_sm_margin=True),
-    fc2_dgrad=RingExchangeOverlapCfg(set_sm_margin=True),
-    proj_fprop=RingExchangeOverlapCfg(cga_size=2, set_sm_margin=True, fp8_buf=True),
-    fc2_fprop=RingExchangeOverlapCfg(cga_size=2, set_sm_margin=True, fp8_buf=True),
-)
-
-# llama3.1 405b LoRA
-userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=None,
-    fc1_dgrad=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-    fc1_wgrad=None,
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=True),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=True),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=True),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=True),
-    proj_fprop=PipelineOverlapCfg(num_sm=32, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-)
-
-# GPT3 20b
-userbuffers_bf16_h100_h6144_tp2_mbs2_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_h100_h6144_tp2_mbs2_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=16, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True, fp8_buf=True),
-)
-
-# GPT3 175b
-userbuffers_bf16_h100_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-userbuffers_fp8_h100_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-userbuffers_bf16_b200_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-userbuffers_fp8_b200_h12288_tp4_mbs1_seqlen2048 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=4, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=16, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-# Nemotron 15B
-userbuffers_bf16_b200_h6144_tp2_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-# Nemotron 340B
-userbuffers_bf16_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=PipelineOverlapCfg(num_sm=24, cga_size=2, num_splits=4, set_sm_margin=True),
-)
-
-userbuffers_fp8_b200_h18432_tp8_mbs1_seqlen4096 = TransformerLayerTPOverlapCfg(
-    qkv_dgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_wgrad=BulkOverlapCfg(num_sm=32, cga_size=2, set_sm_margin=False),
-    fc1_dgrad=BulkOverlapCfg(num_sm=2, cga_size=2, set_sm_margin=False),
-    fc1_wgrad=BulkOverlapCfg(num_sm=8, cga_size=2, set_sm_margin=False),
-    qkv_fprop=RingExchangeOverlapCfg(aggregate=False),
-    proj_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    fc1_fprop=RingExchangeOverlapCfg(aggregate=False),
-    fc2_dgrad=RingExchangeOverlapCfg(aggregate=False),
-    proj_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-    fc2_fprop=RingExchangeOverlapCfg(num_sm=1, set_sm_margin=True),
-)
-
-
-@dataclass
-class _CommOverlapConfig:
-    # Tensor parallel communication overlap (experimental)
-    tp_comm_overlap: bool = None
-    tp_comm_overlap_cfg: dict = None
-    tp_comm_bootstrap_backend: str = None
-    # Pipeline parallel communication overlap
-    overlap_p2p_comm: bool = None
-    batch_p2p_comm: bool = None
-    # Data parallel communication overlap
-    overlap_grad_reduce: bool = None
-    overlap_param_gather: bool = None
-    overlap_param_gather_with_optimizer_step: bool = None
-    align_param_gather: bool = None
-    bucket_size: int = None
-    # Pipeline bubble overlap
-    defer_embedding_wgrad_compute: bool = None
-    wgrad_deferral_limit: int = None
-    # MOE expert parallel comm
-    overlap_moe_expert_parallel_comm: bool = None
-    delay_wgrad_compute: bool = None
-
-
-@dataclass(kw_only=True)
-class CommOverlapConfig:
-    """Configuration for communication overlap optimizations in distributed training.
-
-    This class manages tensor parallel, pipeline parallel, and data parallel
-    communication overlap settings to improve training performance.
-    """
-
-    tp_comm_overlap: bool
-    tp_comm_overlap_cfg: Optional[TransformerLayerTPOverlapCfg] = None
-    tp_comm_bootstrap_backend: Optional[str] = "nccl"
-    overlap_p2p_comm: Optional[bool] = None
-    batch_p2p_comm: Optional[bool] = None
-    overlap_grad_reduce: Optional[bool] = None
-    overlap_param_gather: Optional[bool] = None
-    overlap_param_gather_with_optimizer_step: Optional[bool] = None
-    align_param_gather: Optional[bool] = None
-    bucket_size: Optional[int] = None
-    defer_embedding_wgrad_compute: Optional[bool] = None
-    wgrad_deferral_limit: Optional[int] = None
-    data_parallel_size: Optional[int] = None
-    overlap_moe_expert_parallel_comm: Optional[bool] = None
-    delay_wgrad_compute: Optional[bool] = None
-
-    def finalize(self):
-        # Don't recreate the user_comm_overlap_cfg if the post init is re-run
-        if hasattr(self, "user_comm_overlap_cfg") and self.user_comm_overlap_cfg is not None:
-            return
-
-        self.user_comm_overlap_cfg = _CommOverlapConfig(
-            tp_comm_overlap=self.tp_comm_overlap,
-            tp_comm_overlap_cfg=self.tp_comm_overlap_cfg,
-            tp_comm_bootstrap_backend=self.tp_comm_bootstrap_backend,
-            overlap_p2p_comm=self.overlap_p2p_comm,
-            batch_p2p_comm=self.batch_p2p_comm,
-            overlap_grad_reduce=self.overlap_grad_reduce,
-            overlap_param_gather=self.overlap_param_gather,
-            overlap_param_gather_with_optimizer_step=self.overlap_param_gather_with_optimizer_step,
-            align_param_gather=self.align_param_gather,
-            bucket_size=self.bucket_size,
-            defer_embedding_wgrad_compute=self.defer_embedding_wgrad_compute,
-            wgrad_deferral_limit=self.wgrad_deferral_limit,
-            overlap_moe_expert_parallel_comm=self.overlap_moe_expert_parallel_comm,
-            delay_wgrad_compute=self.delay_wgrad_compute,
-        )
-
-    def _get_model_comm_overlap_cfgs(
-        self,
-        model_cfg: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig,
-        ddp_config: DistributedDataParallelConfig,
-    ) -> _CommOverlapConfig:
-        comm_overlap_cfg = _CommOverlapConfig()
-
-        vp_size = model_cfg.virtual_pipeline_model_parallel_size
-        if vp_size is None:
-            vp_size = 1
-
-        # Optimizations disabled by default, can be overriden by user
-        comm_overlap_cfg.tp_comm_overlap = False
-        comm_overlap_cfg.tp_comm_overlap_cfg = None
-        comm_overlap_cfg.defer_embedding_wgrad_compute = False
-        comm_overlap_cfg.wgrad_deferral_limit = -1
-        comm_overlap_cfg.overlap_moe_expert_parallel_comm = False
-        comm_overlap_cfg.delay_wgrad_compute = False
-
-        # Check if TP overlap can be safely enabled
-        if self.user_comm_overlap_cfg.tp_comm_overlap is True:
-            if model_cfg.tensor_model_parallel_size < 2:
-                logging.warning("Disabling tensor parallel communication overlap due to TP size < 2.")
-                self.user_comm_overlap_cfg.tp_comm_overlap = False
-            elif not model_cfg.sequence_parallel:
-                logging.warning("Disabling tensor parallel communication overlap due to sequence_parallel=False.")
-                self.user_comm_overlap_cfg.tp_comm_overlap = False
-            elif not HAVE_TE:
-                logging.warning("Disabling tensor parallel communication overlap due to TE not detected.")
-                self.user_comm_overlap_cfg.tp_comm_overlap = False
-
-        # PP overlap
-        if model_cfg.pipeline_model_parallel_size > 1:
-            if vp_size > 1:
-                comm_overlap_cfg.overlap_p2p_comm = True
-                comm_overlap_cfg.batch_p2p_comm = False
-            else:
-                comm_overlap_cfg.overlap_p2p_comm = False
-                comm_overlap_cfg.batch_p2p_comm = True
-        else:
-            comm_overlap_cfg.overlap_p2p_comm = False
-            comm_overlap_cfg.batch_p2p_comm = False
-
-        # MOE expert parallel comm overlap
-        assert hasattr(model_cfg, "overlap_moe_expert_parallel_comm"), (
-            f"model_cfg: {model_cfg} does not have overlap_moe_expert_parallel_comm"
-        )
-
-        if self.user_comm_overlap_cfg.overlap_moe_expert_parallel_comm is True:
-            assert model_cfg.expert_model_parallel_size > 1, (
-                "overlap_moe_expert_parallel_comm is only supported when expert_model_parallel_size > 1"
-            )
-            assert model_cfg.num_moe_experts > 1, (
-                f"overlap_moe_expert_parallel_comm is only supported when num_moe_experts > 1, \
-                    but got {model_cfg.num_moe_experts}"
-            )
-            assert model_cfg.moe_token_dispatcher_type in ["alltoall", "flex"], (
-                f"overlap_moe_expert_parallel_comm is only supported when moe_token_dispatcher_type == 'alltoall' or 'flex',\
-                      but got {model_cfg.moe_token_dispatcher_type}"
-            )
-            assert model_cfg.bf16 or model_cfg.fp16, (
-                "overlap_moe_expert_parallel_comm is only supported when using bf16 or fp16 models"
-            )
-            assert is_torch_min_version("2.6.0"), "A2A Overlap encounters hang issue with torch version < 2.6.0"
-            if model_cfg.pipeline_model_parallel_size > 1:
-                assert model_cfg.virtual_pipeline_model_parallel_size is not None, (
-                    "If enabling EP A2A overlap, virtual_pipeline_model_parallel_size "
-                    "must be specified when pipeline_model_parallel_size > 1"
-                )
-            assert model_cfg.recompute_granularity != "full", (
-                "disable full recomputation when enabling overlap_moe_expert_parallel_comm"
-            )
-            assert model_cfg.recompute_method is None, (
-                "disable recomputation method when enabling overlap_moe_expert_parallel_comm"
-            )
-            assert model_cfg.recompute_num_layers is None, (
-                "recompute_num_layers must be None when enabling overlap_moe_expert_parallel_comm"
-            )
-            assert not model_cfg.moe_shared_expert_overlap, (
-                "disable moe_shared_expert_overlap when enabling overlap_moe_expert_parallel_comm"
-            )
-            assert model_cfg.mtp_num_layers is None or model_cfg.mtp_num_layers == 1, (
-                "MTP layernum only supports 1 when enabling overlap_moe_expert_parallel_comm."
-            )
-
-        if self.user_comm_overlap_cfg.delay_wgrad_compute is True:
-            if ddp_config.overlap_grad_reduce or self.user_comm_overlap_cfg.overlap_grad_reduce:
-                assert is_te_min_version("2.7.0"), (
-                    f"TE version >= 2.7.0 is required for overlap_grad_reduce when using"
-                    f"delay_wgrad_compute. Current TE version: {get_te_version()}"
-                )
-            if model_cfg.gradient_accumulation_fusion is True:
-                assert is_te_min_version("2.7.0"), (
-                    f"TE version >= 2.7.0 is required for gradient_accumulation_fusion when using"
-                    f"delay_wgrad_compute. Current TE version: {get_te_version()}"
-                )
-
-            assert (
-                model_cfg.overlap_moe_expert_parallel_comm
-                or self.user_comm_overlap_cfg.overlap_moe_expert_parallel_comm
-            ), "overlap_moe_expert_parallel_comm is required for delay_wgrad_compute"
-
-            # CUDA graph scope-specific validations for delayed wgrad.
-            cuda_graph_scope = getattr(model_cfg, "cuda_graph_scope", []) or []
-            if isinstance(cuda_graph_scope, str):
-                cuda_graph_scope = cuda_graph_scope.split(",") if cuda_graph_scope else []
-            elif not isinstance(cuda_graph_scope, list):
-                cuda_graph_scope = [cuda_graph_scope]
-            attn_scope_enabled = (
-                CudaGraphScope.attn in cuda_graph_scope
-                or CudaGraphScope.attn.value in cuda_graph_scope
-                or f"CudaGraphScope.{CudaGraphScope.attn.value}" in cuda_graph_scope
-            )
-            moe_router_scope_enabled = (
-                CudaGraphScope.moe_router in cuda_graph_scope
-                or CudaGraphScope.moe_router.value in cuda_graph_scope
-                or f"CudaGraphScope.{CudaGraphScope.moe_router.value}" in cuda_graph_scope
-            )
-            wgrad_in_graph_scope = attn_scope_enabled or (
-                moe_router_scope_enabled
-                and getattr(model_cfg, "moe_shared_expert_intermediate_size", None) is not None
-                and not getattr(model_cfg, "moe_shared_expert_overlap", False)
-            )
-            if wgrad_in_graph_scope:
-                assert is_te_min_version("2.12.0"), (
-                    "CUDA graph with delay_wgrad_compute requires TE version >= 2.12.0."
-                )
-                assert model_cfg.gradient_accumulation_fusion, (
-                    "CUDA graph with delay_wgrad_compute requires gradient_accumulation_fusion "
-                    "to be enabled. This is because default gradient accumulation does not use "
-                    "static memory addresses, which breaks CUDA graph requirements."
-                )
-                if attn_scope_enabled:
-                    assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, (
-                        "CUDA graph with delay_wgrad_compute does not support attention bias for now."
-                    )
-
-            # CUDA graph scope-specific validations for delayed wgrad.
-            cuda_graph_scope = getattr(model_cfg, "cuda_graph_scope", None)
-            if cuda_graph_scope is None or cuda_graph_scope == "full":
-                cuda_graph_scope = []
-            elif isinstance(cuda_graph_scope, (str, CudaGraphScope)):
-                cuda_graph_scope = [cuda_graph_scope]
-            attn_scope_enabled = (
-                CudaGraphScope.attn in cuda_graph_scope
-                or CudaGraphScope.attn.value in cuda_graph_scope
-                or f"CudaGraphScope.{CudaGraphScope.attn.value}" in cuda_graph_scope
-            )
-            moe_router_scope_enabled = (
-                CudaGraphScope.moe_router in cuda_graph_scope
-                or CudaGraphScope.moe_router.value in cuda_graph_scope
-                or f"CudaGraphScope.{CudaGraphScope.moe_router.value}" in cuda_graph_scope
-            )
-            wgrad_in_graph_scope = attn_scope_enabled or (
-                moe_router_scope_enabled
-                and getattr(model_cfg, "moe_shared_expert_intermediate_size", None) is not None
-                and not getattr(model_cfg, "moe_shared_expert_overlap", False)
-            )
-            if wgrad_in_graph_scope:
-                assert is_te_min_version("2.12.0"), (
-                    "CUDA graph with delay_wgrad_compute requires TE version >= 2.12.0."
-                )
-                assert model_cfg.gradient_accumulation_fusion, (
-                    "CUDA graph with delay_wgrad_compute requires gradient_accumulation_fusion "
-                    "to be enabled. This is because default gradient accumulation does not use "
-                    "static memory addresses, which breaks CUDA graph requirements."
-                )
-                if attn_scope_enabled:
-                    assert not model_cfg.add_bias_linear and not model_cfg.add_qkv_bias, (
-                        "CUDA graph with delay_wgrad_compute does not support attention bias for now."
-                    )
-
-        comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg)
-        return comm_overlap_cfg
-
-    def _get_optimizer_overlap_cfgs(
-        self, model_cfg: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig
-    ) -> _CommOverlapConfig:
-        vp_size = model_cfg.virtual_pipeline_model_parallel_size
-        if vp_size is None:
-            vp_size = 1
-
-        comm_overlap_cfg = _CommOverlapConfig()
-        comm_overlap_cfg.bucket_size = None
-        comm_overlap_cfg.overlap_grad_reduce = False
-        comm_overlap_cfg.overlap_param_gather = False
-        comm_overlap_cfg.overlap_param_gather_with_optimizer_step = False
-        comm_overlap_cfg.align_param_gather = False
-
-        if self.data_parallel_size > 1:
-            comm_overlap_cfg.bucket_size = 128 * 1024 * 1024
-            comm_overlap_cfg.overlap_grad_reduce = True
-            comm_overlap_cfg.overlap_param_gather = True
-            if model_cfg.pipeline_model_parallel_size > 1 and vp_size > 1:
-                # Currently disabled due to an issue with checkpointing
-                # comm_overlap_cfg.overlap_param_gather_with_optimizer_step = True
-                comm_overlap_cfg.align_param_gather = True
-
-        comm_overlap_cfg = self._override_user_cfgs(comm_overlap_cfg)
-        return comm_overlap_cfg
-
-    def _apply_cfgs(self, src_cfg, dest_cfg):
-        # apply optimizations into dest_cfg
-        for field in fields(src_cfg):
-            if hasattr(dest_cfg, field.name):
-                setattr(dest_cfg, field.name, getattr(src_cfg, field.name))
-
-    def _override_user_cfgs(self, comm_overlap_cfg):
-        # override default configs with any user provided configs
-        if isinstance(self.user_comm_overlap_cfg, _CommOverlapConfig):
-            for field in fields(self.user_comm_overlap_cfg):
-                user_value = getattr(self.user_comm_overlap_cfg, field.name)
-                if user_value is not None:
-                    setattr(comm_overlap_cfg, field.name, user_value)
-
-        return comm_overlap_cfg
-
-    def setup(
-        self,
-        model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig,
-        optimizer_config: OptimizerConfig,
-        ddp_config: DistributedDataParallelConfig,
-    ) -> None:
-        """Set up communication overlap configurations for the model, optimizer, and DDP.
-
-        Args:
-            model_config: Model configuration containing parallelism settings
-            optimizer_config: Optimizer configuration for gradient overlap settings
-            ddp_config: Distributed data parallel configuration
-        """
-        comm_overlap_cfg = self._get_model_comm_overlap_cfgs(model_config, ddp_config)
-        self._apply_cfgs(comm_overlap_cfg, model_config)
-        if model_config.tp_comm_overlap:
-            if comm_overlap_cfg.tp_comm_overlap_cfg is None:
-                logging.warning(
-                    "Tensor parallel overlap: No overlap config provided. "
-                    "Initializing TP comm overlap with the default config."
-                )
-                model_config.tp_comm_overlap_cfg = None
-            else:
-                # ub_cfgs is a dataclass, however TE needs a dict, so convert here
-                model_config.tp_comm_overlap_cfg = asdict(comm_overlap_cfg.tp_comm_overlap_cfg)
-                # remove keys with None values from dictionary to match TE's expectations
-                model_config.tp_comm_overlap_cfg = {
-                    key: value for key, value in model_config.tp_comm_overlap_cfg.items() if value is not None
-                }
-            model_config.tp_comm_bootstrap_backend = comm_overlap_cfg.tp_comm_bootstrap_backend
-
-        # Data parallel overlap is only available with the Megatron DDP and Distributed optimizer
-        if (
-            isinstance(optimizer_config, OptimizerConfig)
-            and isinstance(ddp_config, DistributedDataParallelConfig)
-            and ddp_config.use_distributed_optimizer
-        ):
-            comm_overlap_cfg = self._get_optimizer_overlap_cfgs(model_config)
-            self._apply_cfgs(comm_overlap_cfg, optimizer_config)
-            self._apply_cfgs(comm_overlap_cfg, ddp_config)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/initialize.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import datetime
-import os
-import time
-import warnings
-from typing import Callable, Optional
-
-import torch
-import torch.distributed
-import torch.nn.functional as F
-from megatron.core import parallel_state, tensor_parallel
-from megatron.core.datasets.utils import compile_helpers
-from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
-from megatron.core.fusions.fused_bias_gelu import bias_gelu
-from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
-from megatron.core.hyper_comm_grid import HyperCommGrid
-from megatron.core.num_microbatches_calculator import (
-    destroy_num_microbatches_calculator,
-    init_num_microbatches_calculator,
-)
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler
-from megatron.core.utils import (
-    configure_nvtx_profiling,
-    get_pg_rank,
-    get_te_version,
-    is_te_min_version,
-    is_torch_min_version,
-)
-
-from megatron.bridge.models import GPTModelProvider, T5ModelProvider
-from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig
-from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig
-from megatron.bridge.models.transformer_config import TransformerConfig
-from megatron.bridge.training.config import ConfigContainer, DistributedInitConfig, RerunStateMachineConfig, RNGConfig
-from megatron.bridge.utils.common_utils import (
-    get_local_rank_preinit,
-    get_master_addr_safe,
-    get_master_port_safe,
-    get_rank_safe,
-    get_world_size_safe,
-)
-
-
-def initialize_megatron(
-    cfg: ConfigContainer,
-    allow_no_cuda: bool = False,
-    skip_mpu_initialization: bool = False,
-    get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-    get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-    restart_store: Optional[torch.distributed.Store] = None,
-) -> Callable[[], None] | ProcessGroupCollection | None:
-    """Initialize Megatron core components and distributed setup.
-
-    Sets up logging, initializes distributed environment (torch.distributed),
-    configures microbatch calculator, and sets random seeds.
-
-    Args:
-        cfg: The main configuration container.
-        allow_no_cuda: If True, allows initialization without CUDA.
-        skip_mpu_initialization: If True, skips MPU initialization (for external managers).
-        get_embedding_ranks: Optional function to determine embedding layer ranks.
-        get_position_embedding_ranks: Optional function to determine position embedding ranks.
-        restart_store: Optional store for in-process restart.
-
-    Returns:
-        An optional callable to finish MPU initialization if lazy_mpu_init is True,
-        otherwise None.
-    """
-
-    if not allow_no_cuda:
-        # Make sure cuda is available.
-        assert torch.cuda.is_available(), "Megatron requires CUDA."
-
-    model_config = cfg.model
-    dist_config = cfg.dist
-    rng_config = cfg.rng
-    rerun_state_machine_config = cfg.rerun_state_machine
-    train_config = cfg.train
-    use_inprocess_restart = cfg.inprocess_restart is not None and cfg.inprocess_restart.enabled
-
-    # Configure NVTX profiling if requested
-    if cfg.profiling is not None and cfg.profiling.nvtx_ranges:
-        configure_nvtx_profiling(enabled=True)
-
-    # Prep for checkpoint conversion.
-    # if args.ckpt_convert_format is not None:
-    #     assert args.ckpt_convert_save is not None
-    #     assert args.load is not None
-    #     args.exit_on_missing_checkpoint = True
-
-    # TODO (maanug): determine if we want to support this behavior
-    # if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
-    #     assert args.load is not None, "--use-checkpoint-args requires --load argument"
-    #     load_args_from_checkpoint(args)
-
-    init_num_microbatches_calculator(
-        get_rank_safe(),
-        train_config.rampup_batch_size,
-        train_config.global_batch_size,
-        train_config.micro_batch_size,
-        cfg.data_parallel_size,
-        train_config.decrease_batch_size_if_needed,
-    )
-
-    # init rerun global state
-    init_rerun_state(rerun_state_machine_config)
-
-    # torch.distributed initialization
-    result = torch_dist_init(
-        model_config=model_config,
-        dist_config=dist_config,
-        rng_config=rng_config,
-        micro_batch_size=train_config.micro_batch_size,
-        num_distributed_optimizer_instances=cfg.ddp.num_distributed_optimizer_instances,
-        get_embedding_ranks=get_embedding_ranks,
-        get_position_embedding_ranks=get_position_embedding_ranks,
-        skip_mpu_initialization=skip_mpu_initialization,
-        restart_store=restart_store,
-        use_inprocess_restart=use_inprocess_restart,
-    )
-
-    # Compile dataset helpers after distributed initialization
-    # Use local rank to ensure each node compiles independently (multi-node without shared filesystem)
-    if torch.distributed.is_initialized():
-        if get_local_rank_preinit() == 0:
-            start_time = time.time()
-            print("> compiling dataset index builder ...")
-            compile_helpers()
-            print(
-                ">>> done with dataset index builder. Compilation time: {:.3f} seconds".format(
-                    time.time() - start_time
-                ),
-                flush=True,
-            )
-        torch.distributed.barrier()
-
-    return result
-
-
-def torch_dist_init(
-    model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig,
-    dist_config: DistributedInitConfig,
-    rng_config: RNGConfig,
-    micro_batch_size: int,
-    num_distributed_optimizer_instances: int,
-    get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]],
-    get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]],
-    skip_mpu_initialization: bool,
-    restart_store: Optional[torch.distributed.Store] = None,
-    use_inprocess_restart: bool = False,
-) -> Callable[[], None] | ProcessGroupCollection | None:
-    """Initialize torch.distributed and dependent components.
-
-    Handles the core distributed setup, including process group initialization,
-    MPU (Model Parallel Unit) setup, random seed setting, and optional
-    compilation/warmup steps.
-
-    Args:
-        model_config: Configuration for the specific model (GPTConfig or T5Config).
-        dist_config: Configuration for distributed initialization settings.
-        rng_config: Configuration for random number generation.
-        micro_batch_size: The micro batch size for JIT warmup.
-        num_distributed_optimizer_instances: Number of parallel optimizer instances.
-        get_embedding_ranks: Optional function to determine embedding layer ranks.
-        get_position_embedding_ranks: Optional function to determine position embedding ranks.
-        skip_mpu_initialization: If True, returns a function to finish MPU setup later.
-
-    Returns:
-        An optional callable to finish MPU initialization if skip_mpu_initialization
-        or lazy_mpu_init is True, otherwise None.
-    """
-
-    def finish_mpu_init() -> ProcessGroupCollection:
-        # Pytorch distributed.
-        pg_collection = _initialize_distributed(
-            model_config=model_config.transformer
-            if isinstance(model_config, (GPTModelConfig, MambaModelConfig))
-            else model_config,
-            dist_config=dist_config,
-            num_distributed_optimizer_instances=num_distributed_optimizer_instances,
-            get_embedding_ranks=get_embedding_ranks,
-            get_position_embedding_ranks=get_position_embedding_ranks,
-            restart_store=restart_store,
-            use_inprocess_restart=use_inprocess_restart,
-        )
-
-        # Random seeds for reproducibility.
-        if get_rank_safe() == 0:
-            print("> setting random seeds to {} ...".format(rng_config.seed))
-        _set_random_seed(
-            rng_config.seed,
-            rng_config.data_parallel_random_init,
-            rng_config.te_rng_tracker,
-            rng_config.inference_rng_tracker,
-            use_cudagraphable_rng=(model_config.cuda_graph_impl != "none"),
-            pg_collection=pg_collection,
-        )
-
-        if model_config.num_moe_experts is not None:
-            MoEAuxLossAutoScaler.set_loss_scale(torch.ones(1, device=torch.cuda.current_device()))
-        return pg_collection
-
-    if skip_mpu_initialization:
-        return None
-
-    if dist_config.lazy_mpu_init:
-        # delayed initialization of DDP-related stuff
-        # We only set basic DDP globals
-        parallel_state.set_tensor_model_parallel_world_size(model_config.tensor_model_parallel_size)
-        # and return function for external DDP manager
-        # to call when it has DDP initialized
-        parallel_state.set_tensor_model_parallel_rank(get_rank_safe())
-        return finish_mpu_init
-    # Megatron's MPU is the master. Complete initialization right away.
-    pg_collection = finish_mpu_init()
-
-    if model_config.tp_comm_overlap:
-        _initialize_tp_communicators(model_config, micro_batch_size)
-
-    return pg_collection
-
-
-def init_rerun_state(rerun_state_machine_config: RerunStateMachineConfig) -> None:
-    """Initialize the rerun state machine for result validation or stats.
-
-    Sets up state saving and restoration functions, particularly for RNG trackers.
-
-    Args:
-        rerun_state_machine_config: Configuration for the rerun state machine.
-    """
-    from megatron.core.rerun_state_machine import (
-        RerunDiagnostic,
-        RerunErrorInjector,
-        RerunMode,
-        get_rerun_state_machine,
-        initialize_rerun_state_machine,
-    )
-
-    def state_save_func():
-        return {"rng_tracker_states": tensor_parallel.get_cuda_rng_tracker().get_states()}
-
-    def state_restore_func(state_dict):
-        if state_dict["rng_tracker_states"]:
-            tensor_parallel.get_cuda_rng_tracker().set_states(state_dict["rng_tracker_states"])
-
-    initialize_rerun_state_machine(
-        state_save_func=state_save_func,
-        state_restore_func=state_restore_func,
-        mode=RerunMode(rerun_state_machine_config.rerun_mode),
-        error_injector=RerunErrorInjector(
-            error_injection_rate=rerun_state_machine_config.error_injection_rate,
-            error_injection_type=RerunDiagnostic(rerun_state_machine_config.error_injection_type),
-        ),
-    )
-
-    # Store config on the singleton for use in loss validation
-    rsm = get_rerun_state_machine()
-    rsm.spiky_loss_factor = rerun_state_machine_config.spiky_loss_factor
-
-
-def set_jit_fusion_options(
-    model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig, micro_batch_size: int
-) -> None:
-    """Set PyTorch JIT layer fusion options and warmup JIT functions.
-
-    Configures the JIT fuser (nvFuser or legacy) based on the PyTorch version
-    and warms up common fused kernels like bias_gelu and bias_dropout_add.
-
-    Args:
-        model_config: Configuration for the specific model (GPTConfig or T5Config).
-        micro_batch_size: The micro batch size used for warmup tensor shapes.
-    """
-    # flags required to enable jit fusion kernels
-    if is_torch_min_version("2.2.0a0"):
-        pass  # we're using torch.compile for jit fusion
-    elif is_torch_min_version("1.10.0a0"):
-        # nvfuser
-        torch._C._jit_set_profiling_executor(True)
-        torch._C._jit_set_profiling_mode(True)
-        torch._C._jit_override_can_fuse_on_cpu(False)
-        torch._C._jit_override_can_fuse_on_gpu(False)
-        torch._C._jit_set_texpr_fuser_enabled(False)
-        torch._C._jit_set_nvfuser_enabled(True)
-        torch._C._debug_set_autodiff_subgraph_inlining(False)
-    else:
-        # legacy pytorch fuser
-        torch._C._jit_set_profiling_mode(False)
-        torch._C._jit_set_profiling_executor(False)
-        torch._C._jit_override_can_fuse_on_cpu(True)
-        torch._C._jit_override_can_fuse_on_gpu(True)
-
-    _warmup_jit_function(
-        model_config.transformer if isinstance(model_config, (GPTModelConfig, MambaModelConfig)) else model_config,
-        micro_batch_size,
-    )
-
-
-def destroy_global_state() -> None:
-    """Destroy Megatron global states.
-
-    Cleans up resources used by microbatch calculator, global memory buffer,
-    model parallel groups, and the rerun state machine.
-    """
-    from megatron.core.rerun_state_machine import destroy_rerun_state_machine
-
-    destroy_num_microbatches_calculator()
-    parallel_state.destroy_global_memory_buffer()
-    parallel_state.destroy_model_parallel()
-    destroy_rerun_state_machine()
-
-
-def _initialize_tp_communicators(
-    model_config: GPTModelProvider | T5ModelProvider | GPTModelConfig | MambaModelConfig, micro_batch_size: int
-) -> None:
-    """initializing the communicators with user buffers for high-performance tensor-model-parallel
-    communication overlap"""
-
-    try:
-        import transformer_engine  # noqa: F401
-        import yaml
-        from transformer_engine.pytorch import module as te_module
-
-    except ImportError:
-        raise RuntimeError(
-            "Tensor Parallel Communication/GEMM Overlap optimization needs 'yaml' and 'transformer_engine' packages"
-        )
-
-    if model_config.tp_comm_overlap_cfg is not None:
-        if isinstance(model_config.tp_comm_overlap_cfg, str):
-            with open(model_config.tp_comm_overlap_cfg, "r") as stream:
-                ub_cfgs = yaml.safe_load(stream)
-        else:
-            ub_cfgs = model_config.tp_comm_overlap_cfg
-    else:
-        ub_cfgs = {}
-
-    input_shape = [
-        (model_config.seq_length * micro_batch_size) // model_config.context_parallel_size,
-        model_config.hidden_size,
-    ]
-
-    if is_te_min_version("2.7.0"):
-        UserBufferQuantizationMode = te_module.base.UserBufferQuantizationMode
-        quantization_modes = [UserBufferQuantizationMode.FP8 if model_config.fp8 else UserBufferQuantizationMode.NONE]
-        if (
-            model_config.fp8 is not None
-            and model_config.first_last_layers_bf16
-            and (model_config.num_layers_at_start_in_bf16 > 0 or model_config.num_layers_at_end_in_bf16 > 0)
-        ):
-            quantization_modes.append(UserBufferQuantizationMode.NONE)
-        # The process group with the target bootstrap backend is created in Transformer Engine.
-        te_module.base.initialize_ub(
-            shape=input_shape,
-            tp_size=model_config.tensor_model_parallel_size,
-            quantization_modes=quantization_modes,
-            ub_cfgs=ub_cfgs,
-            bootstrap_backend=model_config.tp_comm_bootstrap_backend,
-        )
-    elif is_te_min_version("1.9.0"):
-        # The process group with the target bootstrap backend is created in Transformer Engine.
-        te_module.base.initialize_ub(
-            shape=input_shape,
-            tp_size=model_config.tensor_model_parallel_size,
-            use_fp8=(model_config.fp8 is not None),
-            ub_cfgs=ub_cfgs,
-            bootstrap_backend=model_config.tp_comm_bootstrap_backend,
-        )
-    else:
-        if model_config.tp_comm_bootstrap_backend != "mpi":
-            warnings.warn(f"Transformer Engine v{get_te_version()} supports only MPI bootstrap backend.")
-        # Create a MPI process group to help with TP communication overlap bootstrap.
-        torch.distributed.new_group(backend="mpi")
-
-        te_module.base.initialize_ub(
-            shape=input_shape,
-            tp_size=model_config.tensor_model_parallel_size,
-            use_fp8=(model_config.fp8 is not None),
-            ub_cfgs=ub_cfgs,
-        )
-
-
-def _create_pg_collection(
-    model_config: TransformerConfig,
-    num_distributed_optimizer_instances: int,
-    get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-    get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-) -> ProcessGroupCollection:
-    """Create all process groups via HyperCommGrid and return a ProcessGroupCollection."""
-    hcp_sizes = getattr(model_config, "hierarchical_context_parallel_sizes", None)
-    if hcp_sizes is not None:
-        raise NotImplementedError(
-            "Decentralized process groups (use_decentralized_pg=True) do not support "
-            "hierarchical_context_parallel_sizes. Use cp_comm_type='a2a' or 'p2p' instead, "
-            "or set use_decentralized_pg=False to use the MPU path which supports 'a2a+p2p'."
-        )
-
-    world_size = torch.distributed.get_world_size()
-    tp_size = int(model_config.tensor_model_parallel_size)
-    pp_size = int(model_config.pipeline_model_parallel_size)
-    cp_size = int(model_config.context_parallel_size) if getattr(model_config, "context_parallel_size", 1) else 1
-    model_size = tp_size * pp_size * cp_size
-    if world_size % model_size != 0:
-        raise RuntimeError(f"world_size ({world_size}) is not divisible by {model_size}")
-    dp_size = world_size // model_size
-
-    grid = HyperCommGrid(
-        shape=[tp_size, cp_size, dp_size, pp_size],
-        dim_names=["tp", "cp", "dp", "pp"],
-        rank_offset=0,
-        backend="nccl",
-    )
-    # Core groups
-    tp_pg = grid.create_pg(["tp"])
-    cp_pg = grid.create_pg(["cp"])
-    pp_pg = grid.create_pg(["pp"])
-    dp_pg = grid.create_pg(["dp"])
-    mp_pg = grid.create_pg(["tp", "pp"])
-    tp_cp_pg = grid.create_pg(["tp", "cp"])
-    tp_dp_cp_pg = grid.create_pg(["tp", "dp", "cp"])
-    dp_cp_pg = grid.create_pg(["dp", "cp"])
-
-    # Expert/MoE related groups (refer to original parallel_state.initialize_model_parallel)
-    expert_tp_size = (
-        int(model_config.expert_tensor_parallel_size)
-        if getattr(model_config, "expert_tensor_parallel_size", None)
-        else tp_size
-    )
-    ep_size = (
-        int(model_config.expert_model_parallel_size) if getattr(model_config, "expert_model_parallel_size", 1) else 1
-    )
-    # Expert data-parallel size folds CP into DP (as in original expert rank generator)
-    expt_model_block = expert_tp_size * ep_size * pp_size
-    if world_size % expt_model_block != 0:
-        raise RuntimeError(
-            f"world_size ({world_size}) is not divisible by expert_tensor_model_pipeline size ({expt_model_block})"
-        )
-    expt_dp_size = world_size // expt_model_block
-    use_optimizer_instance_groups = num_distributed_optimizer_instances > 1
-    inner_dp_dim: Optional[str] = None
-    outer_dp_dim: Optional[str] = None
-    if use_optimizer_instance_groups:
-        assert expt_dp_size % num_distributed_optimizer_instances == 0, (
-            "Expert DP size must be divisible by the number of optimizer instances."
-        )
-        inner_expt_dp_size = expt_dp_size // num_distributed_optimizer_instances
-        expert_grid = HyperCommGrid(
-            shape=[expert_tp_size, ep_size, inner_expt_dp_size, num_distributed_optimizer_instances, pp_size],
-            dim_names=["tp", "ep", "inner_dp", "outer_dp", "pp"],
-            rank_offset=0,
-            backend="nccl",
-        )
-        dp_group_dims: list[str] = ["inner_dp", "outer_dp"]
-        inner_dp_dim = "inner_dp"
-        outer_dp_dim = "outer_dp"
-    else:
-        expert_grid = HyperCommGrid(
-            shape=[expert_tp_size, ep_size, expt_dp_size, pp_size],
-            dim_names=["tp", "ep", "dp", "pp"],
-            rank_offset=0,
-            backend="nccl",
-        )
-        dp_group_dims = ["dp"]
-    ep_pg = expert_grid.create_pg(["ep"])
-    expt_tp_pg = expert_grid.create_pg(["tp"])
-    tp_ep_pg = expert_grid.create_pg(["tp", "ep"])
-    tp_ep_pp_pg = expert_grid.create_pg(["tp", "ep", "pp"])
-    expt_dp_pg = expert_grid.create_pg(dp_group_dims)
-
-    # Embedding and position-embedding groups
-    embd_pg = None
-    pos_embd_pg = None
-    # Enumerate ranks per PP group
-    pp_rank_lists = grid._gen_rank_enum(["pp"])
-    # Determine embedding ranks for each pp group
-    embedding_rank_lists: list[list[int]] = []
-    pos_embedding_rank_lists: list[list[int]] = []
-    for ranks in pp_rank_lists:
-        if not ranks:
-            continue
-        if get_embedding_ranks is not None:
-            # Use custom callback to determine embedding ranks
-            embedding_rank_lists.append(get_embedding_ranks(ranks, pp_size))
-        else:
-            # Default: embedding_ranks are first and last pp stage (or only one if pp_size==1)
-            embedding_rank_lists.append([ranks[0]] if len(ranks) == 1 else [ranks[0], ranks[-1]])
-        if get_position_embedding_ranks is not None:
-            # Use custom callback to determine position embedding ranks
-            pos_embedding_rank_lists.append(get_position_embedding_ranks(ranks, pp_size))
-        else:
-            # Default: position embedding ranks are first pp stage only
-            pos_embedding_rank_lists.append([ranks[0]])
-    if embedding_rank_lists:
-        embd_pg, _ = torch.distributed.new_subgroups_by_enumeration(embedding_rank_lists, backend="nccl")
-    if pos_embedding_rank_lists:
-        pos_embd_pg, _ = torch.distributed.new_subgroups_by_enumeration(pos_embedding_rank_lists, backend="nccl")
-
-    # Build Partial-Distributed-Optimizer groups for Expert DP when multiple instances are used.
-    intra_expt_dp_pg = None
-    inter_dist_opt_pg = None
-    intra_dist_opt_pg = None
-    if inner_dp_dim is not None and outer_dp_dim is not None:
-        intra_expt_dp_pg = expert_grid.create_pg([inner_dp_dim])
-        inter_dist_opt_pg = expert_grid.create_pg([outer_dp_dim])
-        # Match distributed optimizer instance grouping from parallel_state:
-        # combine tp-ep-pp ranks across the intra-partial DP slice.
-        intra_dist_opt_pg = expert_grid.create_pg(["tp", "ep", inner_dp_dim, "pp"])
-
-    # Build ProcessGroupCollection with available groups.
-    pg_collection = ProcessGroupCollection(
-        tp=tp_pg,
-        pp=pp_pg,
-        mp=mp_pg,
-        embd=embd_pg,
-        pos_embd=pos_embd_pg,
-        cp=cp_pg,
-        tp_cp=tp_cp_pg,
-        hcp=None,
-        ep=ep_pg,
-        expt_tp=expt_tp_pg,
-        tp_ep=tp_ep_pg,
-        tp_ep_pp=tp_ep_pp_pg,
-        tp_dp_cp=tp_dp_cp_pg,
-        dp=dp_pg,
-        dp_cp=dp_cp_pg,
-        expt_dp=expt_dp_pg,
-        intra_dp_cp=dp_cp_pg,
-        intra_expt_dp=intra_expt_dp_pg if intra_expt_dp_pg is not None else expt_dp_pg,
-        inter_dist_opt=inter_dist_opt_pg,
-        intra_dist_opt=intra_dist_opt_pg,
-    )
-    return pg_collection
-
-
-def _setup_flight_recorder_env(dist_config: DistributedInitConfig) -> None:
-    """Set flight recorder env vars based on config or pre-existing environment.
-
-    Priority: pre-existing env var > config value. If no dump path is provided
-    (either via config or env), no env vars are set.
-    """
-    _fr_path = (
-        os.environ.get("TORCH_FR_DUMP_TEMP_FILE")
-        or os.environ.get("TORCH_NCCL_DEBUG_INFO_TEMP_FILE")
-        or dist_config.flight_recorder_dump_path
-    )
-    if _fr_path is None:
-        return
-
-    _fr_env_defaults = {
-        "TORCH_FR_DUMP_TEMP_FILE": _fr_path,
-        "TORCH_NCCL_DEBUG_INFO_TEMP_FILE": _fr_path,
-        "TORCH_NCCL_TRACE_BUFFER_SIZE": str(dist_config.flight_recorder_trace_buffer_size),
-        "TORCH_NCCL_DUMP_ON_TIMEOUT": str(int(dist_config.flight_recorder_dump_on_timeout)),
-        "TORCH_INCLUDE_STACK_TRACE": str(int(dist_config.flight_recorder_include_stack_trace)),
-        "TORCH_INCLUDE_ONLY_ACTIVE": str(int(dist_config.flight_recorder_include_only_active)),
-        "TORCH_NCCL_EXTRA_DUMP_ON_EXEC": str(int(dist_config.flight_recorder_extra_dump_on_exec)),
-    }
-    for _var, _default in _fr_env_defaults.items():
-        if _var in os.environ:
-            warnings.warn(
-                f"Flight recorder: env var {_var} is already set to "
-                f"'{os.environ[_var]}'; ignoring config value '{_default}'.",
-                stacklevel=2,
-            )
-        else:
-            os.environ[_var] = _default
-    if get_rank_safe() == 0:
-        print(
-            "Flight recorder env vars:\n" + "\n".join(f"  {k}={os.environ[k]}" for k in _fr_env_defaults),
-            flush=True,
-        )
-
-
-def _initialize_distributed(
-    model_config: TransformerConfig,
-    dist_config: DistributedInitConfig,
-    num_distributed_optimizer_instances: int,
-    get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]],
-    get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]],
-    restart_store: Optional[torch.distributed.Store] = None,
-    use_inprocess_restart: bool = False,
-) -> ProcessGroupCollection:
-    """Initialize torch.distributed and core model parallel."""
-
-    device_count = torch.cuda.device_count()
-    if torch.distributed.is_initialized():
-        if get_rank_safe() == 0:
-            print(
-                "torch distributed is already initialized, skipping initialization ...",
-                flush=True,
-            )
-
-    else:
-        if get_rank_safe() == 0:
-            print("> initializing torch distributed ...", flush=True)
-
-        # Manually set the device ids.
-        if device_count > 0:
-            if dist_config.external_gpu_device_mapping:
-                torch.cuda.set_device(0)
-            else:
-                torch.cuda.set_device(get_local_rank_preinit())
-
-        # Set to non-default stream for cudagraph capturing.
-        if model_config.cuda_graph_impl == "transformer_engine":
-            torch.cuda.set_stream(torch.cuda.Stream())
-
-        # Ensure MASTER_ADDR and MASTER_PORT are set for distributed initialization
-        # These may come from torchrun, SLURM, or defaults
-        if "MASTER_ADDR" not in os.environ:
-            os.environ["MASTER_ADDR"] = get_master_addr_safe()
-        if "MASTER_PORT" not in os.environ:
-            os.environ["MASTER_PORT"] = str(get_master_port_safe())
-
-        _setup_flight_recorder_env(dist_config)
-
-        # Call the init process
-        init_process_group_kwargs = {
-            "backend": dist_config.distributed_backend,
-            "world_size": get_world_size_safe(),
-            "rank": get_rank_safe(),
-            "store": restart_store,
-            "timeout": datetime.timedelta(minutes=dist_config.distributed_timeout_minutes),
-        }
-
-        torch.distributed.init_process_group(**init_process_group_kwargs)
-
-        # Force NCCL backend initialization if using in-process restart
-        if use_inprocess_restart:
-            force_nccl_backend_init(torch.cuda.current_device())
-
-        if dist_config.external_gpu_device_mapping:
-            torch.distributed.barrier(device_ids=[0])
-        else:
-            torch.distributed.barrier(device_ids=[get_local_rank_preinit()])
-
-    # Set the tensor model-parallel, pipeline model-parallel, and
-    # data-parallel communicators.
-
-    if device_count == 0:
-        if dist_config.use_decentralized_pg or dist_config.distributed_backend == "nccl":
-            raise RuntimeError("Cannot initialize parallel groups with no CUDA devices available (device_count=0)")
-
-    if dist_config.use_decentralized_pg:
-        # Use HyperCommGrid to create local parallel groups passed through functions
-        # instead of relying on mcore's global parallel state (mpu) variables.
-        parallel_state._set_global_memory_buffer()
-        pg_collection = _create_pg_collection(
-            model_config,
-            num_distributed_optimizer_instances,
-            get_embedding_ranks=get_embedding_ranks,
-            get_position_embedding_ranks=get_position_embedding_ranks,
-        )
-        if get_rank_safe() == 0:
-            tp = int(model_config.tensor_model_parallel_size)
-            pp = int(model_config.pipeline_model_parallel_size)
-            cp = int(model_config.context_parallel_size) if getattr(model_config, "context_parallel_size", 1) else 1
-            dp = torch.distributed.get_world_size() // (tp * pp * cp)
-            print(f"> initialized HyperCommGrid with tp={tp}, pp={pp}, cp={cp}, dp={dp}")
-        return pg_collection
-    else:
-        # Use the original mcore parallel_state.initialize_model_parallel approach
-        if parallel_state.model_parallel_is_initialized():
-            print("model parallel is already initialized")
-        else:
-            parallel_state.initialize_model_parallel(
-                tensor_model_parallel_size=model_config.tensor_model_parallel_size,
-                pipeline_model_parallel_size=model_config.pipeline_model_parallel_size,
-                virtual_pipeline_model_parallel_size=model_config.virtual_pipeline_model_parallel_size,
-                pipeline_model_parallel_comm_backend=model_config.pipeline_model_parallel_comm_backend,
-                context_parallel_size=model_config.context_parallel_size,
-                hierarchical_context_parallel_sizes=model_config.hierarchical_context_parallel_sizes,
-                hybrid_context_parallel=model_config.hybrid_context_parallel,
-                expert_model_parallel_size=model_config.expert_model_parallel_size,
-                num_distributed_optimizer_instances=num_distributed_optimizer_instances,
-                expert_tensor_parallel_size=model_config.expert_tensor_parallel_size,
-                distributed_timeout_minutes=dist_config.distributed_timeout_minutes,
-                nccl_communicator_config_path=dist_config.nccl_communicator_config_path,
-                order="tp-cp-ep-dp-pp" if not dist_config.use_tp_pp_dp_mapping else "tp-cp-ep-pp-dp",
-                get_embedding_ranks=get_embedding_ranks,
-                get_position_embedding_ranks=get_position_embedding_ranks,
-                create_gloo_process_groups=dist_config.use_gloo_process_groups,
-                use_sharp=dist_config.use_sharp,
-                high_priority_stream_groups=dist_config.high_priority_stream_groups,
-                sharp_enabled_group=dist_config.sharp_enabled_group,
-            )
-            if get_rank_safe() == 0:
-                print(
-                    f"> initialized tensor model parallel with size "
-                    f"{parallel_state.get_tensor_model_parallel_world_size()}"
-                )
-                print(
-                    f"> initialized pipeline model parallel with size "
-                    f"{parallel_state.get_pipeline_model_parallel_world_size()}"
-                )
-        # Return a ProcessGroupCollection using mpu process groups
-        return ProcessGroupCollection.use_mpu_process_groups()
-
-
-def _set_random_seed(
-    seed_: int,
-    data_parallel_random_init: bool = False,
-    te_rng_tracker: bool = False,
-    inference_rng_tracker: bool = False,
-    use_cudagraphable_rng: bool = False,
-    *,
-    pg_collection: ProcessGroupCollection,
-) -> None:
-    """Set random seed for reproducability."""
-    assert seed_ is not None and seed_ > 0, f"Seed ({seed_}) should be a positive integer."
-
-    import random
-
-    import numpy as np
-
-    current_rank = torch.distributed.get_rank()
-    # Ensure that different pipeline MP stages get different seeds.
-    pp_rank = torch.distributed.get_group_rank(pg_collection.pp, current_rank)
-    seed = seed_ + (100 * pp_rank)
-    # Ensure different data parallel ranks get different seeds
-    if data_parallel_random_init:
-        dp_rank = torch.distributed.get_group_rank(pg_collection.dp, current_rank)
-        seed = seed + (10 * dp_rank)
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.device_count() > 0:
-        # Derive TP/EP/ETP ranks from provided process groups using helper utils
-        tp_rank = get_pg_rank(pg_collection.tp)
-        ep_rank = get_pg_rank(pg_collection.ep)
-        etp_rank = get_pg_rank(pg_collection.expt_tp)
-
-        tensor_parallel.model_parallel_cuda_manual_seed(
-            seed,
-            te_rng_tracker,
-            inference_rng_tracker,
-            use_cudagraphable_rng,
-            tp_rank=tp_rank,
-            ep_rank=ep_rank,
-            etp_rank=etp_rank,
-        )
-
-
-def _warmup_jit_function(model_config: TransformerConfig, micro_batch_size: int) -> None:
-    """Compilie JIT functions before the main training steps"""
-    if model_config.bf16:
-        dtype = torch.bfloat16
-    elif model_config.fp16:
-        dtype = torch.float16
-    else:
-        dtype = torch.float32
-    # Warmup fused bias+gelu
-    bias = torch.rand(
-        model_config.ffn_hidden_size // model_config.tensor_model_parallel_size,
-        dtype=dtype,
-        device="cuda",
-    )
-    input = torch.rand(
-        (
-            model_config.seq_length // model_config.context_parallel_size,
-            micro_batch_size,
-            model_config.ffn_hidden_size // model_config.tensor_model_parallel_size,
-        ),
-        dtype=dtype,
-        device="cuda",
-    )
-    # Warmup JIT fusions with the input grad_enable state of both forward
-    # prop and recomputation
-    for bias_grad, input_grad in zip([True, True], [False, True]):
-        bias.requires_grad, input.requires_grad = bias_grad, input_grad
-        for _ in range(5):
-            if model_config.activation_func == F.silu:
-                output = bias_swiglu(input, bias)
-            else:
-                output = bias_gelu(bias, input)
-    del bias, input, output
-
-    # Warmup fused bias+dropout+add
-    if model_config.sequence_parallel:
-        tp_world_size = int(model_config.tensor_model_parallel_size)
-        seq_length = model_config.seq_length // tp_world_size
-    else:
-        seq_length = model_config.seq_length
-    input = torch.rand(
-        (
-            seq_length // model_config.context_parallel_size,
-            micro_batch_size,
-            model_config.hidden_size,
-        ),
-        dtype=dtype,
-        device="cuda",
-    )
-    residual = torch.rand(
-        (
-            seq_length // model_config.context_parallel_size,
-            micro_batch_size,
-            model_config.hidden_size,
-        ),
-        dtype=dtype,
-        device="cuda",
-    )
-    bias = torch.rand((model_config.hidden_size), dtype=dtype, device="cuda").expand_as(residual)
-    dropout_rate = 0.1
-    # Warmup JIT fusions with the input grad_enable state of both forward
-    # prop and recomputation
-    for input_grad, bias_grad, residual_grad in zip([False, True], [True, True], [True, True]):
-        input.requires_grad = input_grad
-        bias.requires_grad = bias_grad
-        residual.requires_grad = residual_grad
-        for _ in range(5):
-            output = bias_dropout_add_fused_train([input, bias], residual, dropout_rate)
-    del bias, input, residual, output
-    torch.cuda.empty_cache()
-
-
-def force_nccl_backend_init(device_id: torch.device) -> None:
-    """Force NCCL backend initialization for in-process restart compatibility.
-
-    The nvidia-resiliency-ext in-process restart uses destroy_process_group to
-    terminate the NCCL backend, which does not terminate NCCL kernels if the NCCL
-    backend wasn't fully initialized before additional distributed subgroups are created.
-
-    This function forces full initialization of the NCCL backend by performing
-    a simple all_reduce operation.
-
-    Args:
-        device_id: CUDA device ID to use for the dummy tensor operation
-    """
-    tensor = torch.ones(128, device=device_id)
-    torch.distributed.all_reduce(tensor)
-    torch.cuda.synchronize()
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/mbridge-pretrain.txt b/skills/nemotron-customize/context/mbridge-pretrain.txt
deleted file mode 100644
index 43d487b2e..000000000
--- a/skills/nemotron-customize/context/mbridge-pretrain.txt
+++ /dev/null
@@ -1,13770 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Megatron-Bridge
-├── docs
-│   ├── models
-│   │   ├── llm
-│   │   │   ├── README.md *
-│   │   │   ├── index.md *
-│   │   │   ├── nemotron3-super.md *
-│   │   │   ├── nemotron3.md *
-│   │   │   └── nemotronh.md *
-│   │   ├── vlm
-│   │   └── README.md *
-│   ├── training
-│   │   ├── images
-│   │   │   ├── activation-recomputation-example-1.jpg *
-│   │   │   ├── activation-recomputation-example-2.jpg *
-│   │   │   ├── canonical_lora.png *
-│   │   │   ├── performant_lora.png *
-│   │   │   ├── pp_comm_overlap.png *
-│   │   │   └── tp_comm_overlap.png *
-│   │   ├── README.md *
-│   │   ├── activation-recomputation.md *
-│   │   ├── attention-optimizations.md *
-│   │   ├── callbacks.md *
-│   │   ├── checkpointing.md *
-│   │   ├── communication-overlap.md *
-│   │   ├── config-container-overview.md *
-│   │   ├── cpu-offloading.md *
-│   │   ├── cuda-graphs.md *
-│   │   ├── distillation.md *
-│   │   ├── entry-points.md *
-│   │   ├── hybrid-context-parallel.md *
-│   │   ├── logging.md *
-│   │   ├── megatron-fsdp.md *
-│   │   ├── mixed-precision.md *
-│   │   ├── multi-token-prediction.md *
-│   │   ├── optimizer-scheduler.md *
-│   │   ├── packed-sequences.md *
-│   │   ├── peft.md *
-│   │   ├── profiling.md *
-│   │   ├── pruning.md *
-│   │   ├── resiliency.md *
-│   │   └── training-loop-settings.md *
-│   ├── images
-│   ├── modelopt
-│   ├── releases
-│   ├── README.md *
-│   ├── index.md *
-│   ├── parallelisms.md *
-│   ├── performance-guide.md *
-│   ├── performance-summary.md *
-│   └── recipe-usage.md *
-├── examples
-│   ├── models
-│   │   ├── gpt_oss
-│   │   │   ├── README.md *
-│   │   │   └── slurm_pretrain.sh *
-│   │   ├── nemotron_3
-│   │   │   ├── nano
-│   │   │   │   ├── pretrain_nemotron_3_nano.py * +
-│   │   │   │   └── slurm_pretrain.sh *
-│   │   │   ├── super
-│   │   │   │   ├── pretrain_nemotron_3_super.py * +
-│   │   │   │   └── slurm_pretrain.sh *
-│   │   │   └── README.md *
-│   │   ├── audio_lm
-│   │   │   ├── qwen2_audio
-│   │   │   └── qwen3_asr
-│   │   ├── bailing
-│   │   ├── minimax_m2
-│   │   ├── qwen3_next
-│   │   │   └── conf
-│   │   ├── sarvam
-│   │   └── vlm
-│   │       ├── gemma3_vl
-│   │       ├── glm_45v
-│   │       ├── kimi_k25_vl
-│   │       ├── ministral3
-│   │       ├── nemotron_vl
-│   │       │   └── ...
-│   │       ├── qwen25_omni
-│   │       ├── qwen35_vl
-│   │       ├── qwen3_vl
-│   │       └── qwen_vl
-│   │           └── ...
-│   ├── conversion
-│   │   ├── adapter
-│   │   └── compare_hf_and_megatron
-│   ├── decentralized_pg
-│   ├── diffusion
-│   │   └── recipes
-│   │       ├── flux
-│   │       │   └── ...
-│   │       └── wan
-│   │           └── ...
-│   ├── distillation
-│   │   └── llama
-│   │       └── conf
-│   ├── evaluation
-│   │   └── utils
-│   ├── inference
-│   │   └── vlm
-│   ├── long_context
-│   ├── peft
-│   ├── quantization
-│   │   └── conf
-│   ├── resiliency
-│   │   ├── fault_tolerance
-│   │   └── straggler_detection
-│   └── rl
-├── scripts
-│   ├── training
-│   │   ├── README.md *
-│   │   ├── launch_with_nemo_run.py * +
-│   │   ├── launch_with_sbatch.sh *
-│   │   └── run_recipe.py * +
-│   └── performance
-│       ├── configs
-│       │   ├── deepseek
-│       │   ├── gpt_oss
-│       │   ├── kimi
-│       │   ├── llama
-│       │   ├── nemotronh
-│       │   ├── qwen
-│       │   └── qwen_vl
-│       └── utils
-├── src
-│   └── megatron
-│       └── bridge
-│           ├── recipes
-│           │   ├── nemotronh
-│           │   │   ├── __init__.py * +
-│           │   │   ├── nemotron_3_nano.py * +
-│           │   │   └── nemotron_3_super.py * +
-│           │   ├── utils
-│           │   │   └── dataset_utils.py * +
-│           │   ├── __init__.py * +
-│           │   ├── common.py * +
-│           │   ├── ...
-│           ├── training
-│           │   ├── utils
-│           │   │   └── omegaconf_utils.py * +
-│           │   ├── gpt_step.py * +
-│           │   ├── pretrain.py * +
-│           │   ├── setup.py * +
-│           │   ├── ...
-│           ├── data
-│           │   └── ...
-│           ├── diffusion
-│           │   └── ...
-│           ├── inference
-│           │   └── ...
-│           ├── models
-│           │   └── ...
-│           ├── peft
-│           └── utils
-├── .github
-│   ├── ISSUE_TEMPLATE
-│   ├── actions
-│   │   └── test-template
-│   └── workflows
-│       └── config
-├── .specstory
-├── 3rdparty
-│   └── Megatron-LM
-│       ├── .github
-│       │   ├── ISSUE_TEMPLATE
-│       │   ├── actions
-│       │   │   └── ...
-│       │   ├── scripts
-│       │   └── workflows
-│       │       └── ...
-│       ├── .gitlab
-│       │   ├── scripts
-│       │   └── stages
-│       ├── docker
-│       │   ├── common
-│       │   └── patches
-│       ├── docs
-│       │   ├── advanced
-│       │   ├── api-guide
-│       │   │   └── ...
-│       │   ├── developer
-│       │   ├── discussions
-│       │   │   └── ...
-│       │   ├── get-started
-│       │   ├── images
-│       │   │   └── ...
-│       │   ├── models
-│       │   └── user-guide
-│       │       └── ...
-│       ├── examples
-│       │   ├── academic_paper_scripts
-│       │   │   └── ...
-│       │   ├── bert
-│       │   ├── export
-│       │   │   └── ...
-│       │   ├── gpt3
-│       │   ├── inference
-│       │   │   └── ...
-│       │   ├── llama
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   │   └── ...
-│       │   ├── mixtral
-│       │   ├── multimodal
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   │   └── ...
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── t5
-│       ├── images
-│       ├── megatron
-│       │   ├── core
-│       │   │   └── ...
-│       │   ├── inference
-│       │   ├── legacy
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── training
-│       │       └── ...
-│       ├── scripts
-│       ├── tasks
-│       ├── tests
-│       │   ├── functional_tests
-│       │   │   └── ...
-│       │   ├── test_utils
-│       │   │   └── ...
-│       │   └── unit_tests
-│       │       └── ...
-│       └── tools
-│           ├── bert_embedding
-│           └── checkpoint
-├── docker
-│   ├── common
-│   └── patches
-├── skills
-│   ├── adding-model-support
-│   ├── code-style
-│   ├── developer-guide
-│   ├── mlm-bridge-training
-│   ├── multi-node-slurm
-│   ├── parity-testing
-│   ├── perf-techniques
-│   │   ├── cuda-graphs
-│   │   ├── expert-parallel-overlap
-│   │   ├── hybrid-context-parallel
-│   │   ├── megatron-fsdp
-│   │   ├── moe-comm-overlap
-│   │   ├── packed-sequences-long-context
-│   │   ├── parallelism-strategies
-│   │   ├── sequence-packing
-│   │   └── tp-dp-comm-overlap
-│   └── resiliency
-├── tests
-│   ├── functional_tests
-│   │   ├── data
-│   │   │   ├── energon
-│   │   │   └── hf_processors
-│   │   ├── diffusion
-│   │   │   ├── flux
-│   │   │   └── wan
-│   │   ├── inference
-│   │   ├── launch_scripts
-│   │   │   ├── active
-│   │   │   └── flaky
-│   │   ├── models
-│   │   │   ├── qwen3_asr
-│   │   │   └── qwen_audio
-│   │   └── test_groups
-│   │       ├── ckpts
-│   │       │   └── ...
-│   │       ├── converter
-│   │       ├── data
-│   │       │   └── ...
-│   │       ├── diffusion
-│   │       │   └── ...
-│   │       ├── models
-│   │       │   └── ...
-│   │       ├── quantization
-│   │       │   └── ...
-│   │       ├── recipes
-│   │       ├── training
-│   │       └── utils
-│   └── unit_tests
-│       ├── data
-│       │   ├── builders
-│       │   ├── datasets
-│       │   ├── energon
-│       │   ├── mimo
-│       │   └── vlm_datasets
-│       ├── diffusion
-│       │   ├── data
-│       │   │   └── ...
-│       │   ├── model
-│       │   │   └── ...
-│       │   └── recipes
-│       │       └── ...
-│       ├── inference
-│       │   └── vlm
-│       ├── models
-│       │   ├── common
-│       │   ├── decorators
-│       │   ├── deepseek
-│       │   ├── gemma
-│       │   ├── gemma_vl
-│       │   ├── glm
-│       │   ├── glm_vl
-│       │   ├── gpt
-│       │   ├── gpt_oss
-│       │   ├── hf_pretrained
-│       │   ├── kimi
-│       │   ├── kimi_vl
-│       │   ├── llama
-│       │   ├── llama_nemotron
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   ├── minimax_m2
-│       │   ├── ministral3
-│       │   ├── mistral
-│       │   ├── nemotron
-│       │   ├── nemotron_vl
-│       │   ├── nemotronh
-│       │   ├── olmoe
-│       │   ├── qwen
-│       │   ├── qwen3_asr
-│       │   │   └── ...
-│       │   ├── qwen_audio
-│       │   ├── qwen_omni
-│       │   │   └── ...
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── sarvam
-│       ├── peft
-│       ├── recipes
-│       │   ├── gemma
-│       │   ├── gpt
-│       │   ├── kimi
-│       │   ├── nemotronh
-│       │   ├── qwen
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── utils
-│       ├── scripts
-│       │   └── performance
-│       ├── training
-│       │   ├── mimo
-│       │   ├── mlm_compat
-│       │   ├── post_training
-│       │   └── utils
-│       └── utils
-└── tutorials
-    ├── data
-    │   └── dclm
-    ├── recipes
-    │   └── llama
-    │       └── conf
-    └── training
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; depth cap 3; selected files shown.
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/setup.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import logging
-import time
-from functools import partial
-from typing import Any, Callable, NamedTuple, Optional
-
-from megatron.bridge.models.common import ModelBuilder, ModelConfig
-from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig
-from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig
-from megatron.bridge.models.model_provider import ModelProviderMixin
-from megatron.bridge.models.transformer_config import TransformerConfig
-import torch
-from megatron.core.config import set_experimental_flag
-from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig, finalize_model_grads
-from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel as megatron_FSDP
-from megatron.core.jit import disable_jit_fuser
-from megatron.core.optimizer import MegatronOptimizer
-from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.rerun_state_machine import RerunDataIterator
-from megatron.core.transformer import MegatronModule
-
-from megatron.bridge.data.loaders import setup_data_iterators
-from megatron.bridge.training.callbacks import CallbackContext, CallbackManager, should_fire
-from megatron.bridge.models import GPTModelProvider, T5ModelProvider
-from megatron.bridge.training import fault_tolerance
-from megatron.bridge.training.checkpointing import (
-    _load_checkpoint_from_path,
-    checkpoint_exists,
-    CheckpointLoadContext,
-    CheckpointManager,
-    create_checkpoint_manager,
-)
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.initialize import initialize_megatron, set_jit_fusion_options
-from megatron.bridge.training.optim import setup_optimizer
-from megatron.bridge.training.state import GlobalState
-from megatron.bridge.training.tensor_inspect import (
-    finalize_tensor_inspect_post_model_initialization,
-    initialize_tensor_inspect_pre_model_initialization,
-)
-from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer
-from megatron.bridge.training.utils.log_utils import append_to_progress_log, barrier_and_log, setup_logging
-from megatron.bridge.utils.common_utils import get_rank_safe, print_rank_0
-
-class SetupOutput(NamedTuple):
-    """Represents the output of the main setup function.
-
-    Contains all the initialized components necessary for training or evaluation.
-
-    Attributes:
-        state: The global state object holding configuration and runtime information.
-        model: The initialized Megatron model.
-        optimizer: The initialized optimizer.
-        scheduler: The initialized learning rate scheduler.
-        train_data_iterator: The data iterator for the training dataset, if applicable.
-        valid_data_iterator: The data iterator for the validation dataset, if applicable.
-        test_data_iterator: The data iterator for the testing dataset, if applicable.
-        checkpoint_manager: The checkpoint manager for save/load operations.
-        pg_collection: The process group collection initialized for this run.
-    """
-
-    state: GlobalState
-    model: MegatronModule
-    optimizer: MegatronOptimizer
-    scheduler: OptimizerParamScheduler
-    train_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]]
-    valid_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]]
-    test_data_iterator: Optional[RerunDataIterator | list[RerunDataIterator]]
-    checkpoint_manager: CheckpointManager
-    pg_collection: ProcessGroupCollection
-
-
-def setup(
-    state: GlobalState,
-    train_valid_test_datasets_provider: Callable[..., tuple[Optional[Any], Optional[Any], Optional[Any]]],
-    get_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-    get_position_embedding_ranks: Optional[Callable[[list[int], Optional[int]], list[int]]] = None,
-    restart_store: Optional[torch.distributed.Store] = None,
-    callback_manager: CallbackManager | None = None,
-) -> SetupOutput:
-    """Initialize the training/evaluation environment using an existing GlobalState.
-
-    Performs all runtime setup using the provided `state` and its attached config (`state.cfg`).
-    This includes:
-      - enabling Megatron-Core experimental features
-      - initializing async checkpoint workers (if enabled)
-      - logging setup
-      - torch.distributed and model-parallel initialization (via initialize_megatron)
-      - tokenizer/model/optimizer/scheduler construction
-      - optional checkpoint load
-      - dataloader setup
-
-    Args:
-        state: The GlobalState instance to populate and use throughout setup.
-        train_valid_test_datasets_provider: Callable returning the train/valid/test datasets or iterators.
-        get_embedding_ranks: Optional function to determine embedding layer ranks for model-parallel init.
-        get_position_embedding_ranks: Optional function to determine positional embedding ranks.
-        restart_store: Optional torch.distributed Store used when in-process restart is enabled.
-        callback_manager: Optional CallbackManager whose on_data_init_start hook is fired
-            after the model/optimizer/checkpoint are ready but before any dataset files are
-            opened. Use this for JIT warmup with mock data and MLPerf init_stop/run_start
-            logging to ensure no real dataset I/O occurs before run_start is recorded.
-
-    Returns:
-        SetupOutput containing the populated state, model, optimizer, scheduler, dataloaders, and ckpt context.
-    """
-    cfg = state.cfg
-    maybe_log_and_save_config(cfg)
-
-    # Conditionally enable experimental features for Megatron Core
-    set_experimental_flag(cfg.dist.enable_megatron_core_experimental)
-
-    # Disable the JIT fuser if requested
-    if cfg.dist.disable_jit_fuser:
-        print_rank_0("Disabling JIT fuser.")
-        disable_jit_fuser()
-
-    # Initialize async checkpoint worker if enabled (idempotent if already initialized)
-    state.initialize_async_checkpoint_worker()
-
-    setup_logging(
-        logging_level=cfg.logger.logging_level,
-        filter_warning=cfg.logger.filter_warnings,
-        modules_to_filter=cfg.logger.modules_to_filter,
-        set_level_for_all_loggers=cfg.logger.set_level_for_all_loggers,
-    )
-
-    # pg_collection is returned from initialize_megatron:
-    # - When use_decentralized_pg=True: uses HyperCommGrid to create local process groups
-    # - When use_decentralized_pg=False: uses mpu's global parallel state
-    pg_collection = initialize_megatron(
-        cfg=cfg,
-        get_embedding_ranks=get_embedding_ranks,
-        get_position_embedding_ranks=get_position_embedding_ranks,
-        restart_store=restart_store,
-    )
-
-    # Set CPU affinity for optimal host-device transfers when fine-grained activation offloading is enabled
-    if cfg.model.fine_grained_activation_offloading:
-        from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu
-
-        set_ideal_affinity_for_current_gpu()
-
-    timers = state.timers
-
-    if cfg.logger.log_progress:
-        append_to_progress_log(cfg.checkpoint.save, "Starting job")
-
-    if cfg.ft and cfg.ft.enable_ft_package:
-        fault_tolerance.setup(cfg, state)
-        fault_tolerance.maybe_setup_simulated_fault(cfg.ft)
-
-    # Set pytorch JIT layer fusion options and warmup JIT functions.
-    set_jit_fusion_options(cfg.model, cfg.train.micro_batch_size)
-
-    # Adjust the startup time so it reflects the largest value.
-    # This will be closer to what scheduler will see (outside of
-    # image ... launches.
-    start_time_tensor = torch.tensor([state.start_time], dtype=torch.double, device="cuda")
-    torch.distributed.all_reduce(start_time_tensor, op=torch.distributed.ReduceOp.MIN)
-    state.start_time = start_time_tensor.item()
-
-    print_rank_0("time to initialize megatron (seconds): {:.3f}".format(time.time() - state.start_time))
-    barrier_and_log("after megatron is initialized")
-
-    # Create checkpoint manager for save/load operations.
-    checkpoint_manager = create_checkpoint_manager(cfg.checkpoint)
-
-    # Tokenizer
-    timers("tokenizer-setup", log_level=0).start(barrier=True)
-    tokenizer = build_tokenizer(cfg.tokenizer)
-    # Handle model vocab_size configuration with proper validation
-    cfg.model.vocab_size, cfg.model.should_pad_vocab = _validate_and_set_vocab_size(
-        model_vocab_size=cfg.model.vocab_size,
-        tokenizer_vocab_size=tokenizer.vocab_size,
-    )
-
-    cfg.dataset.tokenizer = tokenizer
-    timers("tokenizer-setup").stop()
-    barrier_and_log("after tokenizer is built")
-
-    # Initialize NVIDIA DLFw Inspect early (this must happen before TE modules are constructed)
-    initialize_tensor_inspect_pre_model_initialization(cfg.tensor_inspect)
-
-    # Model, optimizer, and learning rate.
-    timers("model-and-optimizer-setup", log_level=0).start(barrier=True)
-
-    # Register PEFT pre-wrap hook if PEFT is configured
-    if cfg.peft is not None:
-        peft_hook = _create_peft_pre_wrap_hook(cfg, state)
-        _register_pre_wrap_hook(cfg.model, peft_hook)
-        print_rank_0("Registered PEFT pre-wrap hook")
-
-    if getattr(cfg.model, "restore_modelopt_state", False):
-        from megatron.bridge.training.post_training.checkpointing import load_modelopt_state
-
-        def modelopt_pre_wrap_hook(model):
-            from megatron.bridge.training.post_training.checkpointing import has_modelopt_state
-
-            # Check which checkpoint path has modelopt state
-            if cfg.checkpoint.pretrained_checkpoint and has_modelopt_state(cfg.checkpoint.pretrained_checkpoint):
-                checkpoint_path = cfg.checkpoint.pretrained_checkpoint
-            elif cfg.checkpoint.load and has_modelopt_state(cfg.checkpoint.load):
-                checkpoint_path = cfg.checkpoint.load
-            else:
-                raise RuntimeError(
-                    f"No modelopt_state found in pretrained_checkpoint={cfg.checkpoint.pretrained_checkpoint} "
-                    f"or load={cfg.checkpoint.load}"
-                )
-
-            load_modelopt_state(model, checkpoint_path)
-            return model
-
-        _register_pre_wrap_hook(cfg.model, modelopt_pre_wrap_hook)
-
-    model = _build_distributed_model(cfg, pg_collection)
-
-    cfg.model.timers = timers
-    cfg.optimizer.timers = timers
-    optimizer, scheduler = setup_optimizer(
-        optimizer_config=cfg.optimizer,
-        scheduler_config=cfg.scheduler,
-        model=model,
-        use_gloo_process_groups=cfg.dist.use_gloo_process_groups,
-        # Only pass pg_collection when use_decentralized_pg is True.
-        # When False, mcore's optimizer will use parallel_state directly which supports Gloo.
-        pg_collection=pg_collection if cfg.dist.use_decentralized_pg else None,
-        optimizer_config_override_provider=cfg.optimizer_config_override_provider,
-    )
-    timers("model-and-optimizer-setup").stop()
-    barrier_and_log("after model, optimizer, and learning rate scheduler are built")
-
-    # Check if a local (non-persistent) checkpoint is available.  Local
-    # checkpoints are independent of global ones — they don't write
-    # latest_train_state.pt to load_dir, so checkpoint_exists() won't
-    # find them.
-    _ckpt_ctx = getattr(checkpoint_manager, "checkpointing_context", {})
-    has_local_checkpoint = (
-        "local_checkpoint_manager" in _ckpt_ctx
-        and _ckpt_ctx["local_checkpoint_manager"].find_latest() != -1
-    )
-
-    # For PEFT, the pretrained checkpoint is loaded in the pre-wrap hook
-    if cfg.peft is not None:
-        should_load_checkpoint = cfg.checkpoint.load is not None and checkpoint_exists(cfg.checkpoint.load)
-        if should_load_checkpoint:
-            # The finetune toggle is explicitly set to True in order to avoid loading optimizer and RNG states
-            # This is switched off here in order to load these states from the checkpoint
-            cfg.checkpoint.finetune = False
-    else:
-        should_load_checkpoint = (
-            (cfg.checkpoint.load is not None and checkpoint_exists(cfg.checkpoint.load))
-            or (
-                cfg.checkpoint.pretrained_checkpoint is not None
-                and checkpoint_exists(cfg.checkpoint.pretrained_checkpoint)
-            )
-            or has_local_checkpoint
-        )
-
-    if should_load_checkpoint:
-        timers("load-checkpoint", log_level=0).start(barrier=True)
-        checkpoint_manager.load(CheckpointLoadContext(
-            state=state,
-            model=model,
-            optimizer=optimizer,
-            opt_param_scheduler=scheduler,
-            skip_load_to_model_and_opt=cfg.dist.use_torch_fsdp2 or cfg.dist.use_megatron_fsdp,
-        ))
-        timers("load-checkpoint").stop(barrier=True)
-        timers.log(["load-checkpoint"])
-
-    # Finalize NVIDIA DLFw Inspect after model is built (attach loggers, module names, parallelism groups)
-    finalize_tensor_inspect_post_model_initialization(
-        cfg.tensor_inspect,
-        model,
-        state.tensorboard_logger,
-        state.wandb_logger,
-        comet_logger=state.comet_logger,
-        current_training_step=state.train_state.step,
-    )
-
-    _update_model_config_funcs(
-        model,
-        cfg.model.transformer if isinstance(cfg.model, (GPTModelConfig, MambaModelConfig)) else cfg.model,
-        cfg.ddp,
-        optimizer,
-        align_grad_reduce=cfg.dist.align_grad_reduce,
-        pg_collection=pg_collection,
-    )
-
-    # Fire on_data_init_start before any dataset files are opened.
-    # This is the correct place for JIT warmup with mock data and MLPerf
-    # init_stop/run_start logging.
-    if should_fire(callback_manager, "on_data_init_start"):
-        context = CallbackContext(
-            state=state,
-            model=model,
-            optimizer=optimizer,
-            scheduler=scheduler,
-            user_state=callback_manager.user_state,
-        )
-        callback_manager.fire("on_data_init_start", context)
-
-    # Data stuff.
-    timers("train/valid/test-data-iterators-setup", log_level=0).start(barrier=True)
-    if "tokenizer" in inspect.signature(train_valid_test_datasets_provider).parameters:
-        train_valid_test_datasets_provider = partial(train_valid_test_datasets_provider, tokenizer=tokenizer)
-    if "pg_collection" in inspect.signature(train_valid_test_datasets_provider).parameters:
-        train_valid_test_datasets_provider = partial(train_valid_test_datasets_provider, pg_collection=pg_collection)
-
-    train_data_iterator, valid_data_iterator, test_data_iterator = setup_data_iterators(
-        cfg=cfg,
-        train_state=state.train_state,
-        model_length=len(model),
-        train_valid_test_datasets_provider=train_valid_test_datasets_provider,
-        dp_group=pg_collection.dp,
-    )
-    timers("train/valid/test-data-iterators-setup").stop()
-    barrier_and_log("after dataloaders are built")
-
-    # if args.enable_ft_package and ft_integration.get_rank_monitor_client() is not None:
-    #     ft_integration.get_rank_monitor_client().init_workload_monitoring()
-    #     ft_timeouts = ft_integration.get_rank_monitor_client().timeouts
-    #     print_rank_0(f"Fault tolerance client initialized. Timeouts: {ft_timeouts}")
-
-    # Print setup timing.
-    print_rank_0("done with setup ...")
-    timers.log(["model-and-optimizer-setup", "train/valid/test-data-iterators-setup"], barrier=True)
-
-    return SetupOutput(
-        state,
-        model,
-        optimizer,
-        scheduler,
-        train_data_iterator,
-        valid_data_iterator,
-        test_data_iterator,
-        checkpoint_manager,
-        pg_collection,
-    )
-
-
-def _register_pre_wrap_hook(model_cfg: ModelConfig | ModelProviderMixin, hook):
-    """Register a pre-wrap hook on either ModelConfig or ModelProviderMixin."""
-    if isinstance(model_cfg, ModelConfig):
-        model_cfg.pre_wrap_hooks.append(hook)
-    else:
-        model_cfg.register_pre_wrap_hook(hook)
-
-
-def _build_distributed_model(cfg: ConfigContainer, pg_collection: ProcessGroupCollection) -> list[MegatronModule]:
-    """Build distributed model from either ModelConfig or ModelProviderMixin."""
-    model_config = cfg.model
-    if isinstance(model_config, ModelConfig):
-        builder_cls = model_config.get_builder_cls()
-        builder = builder_cls(model_config)
-        return builder.build_distributed_models(
-            pg_collection=pg_collection,
-            ddp_config=cfg.ddp,
-            overlap_param_gather_with_optimizer_step=cfg.optimizer.overlap_param_gather_with_optimizer_step,
-            use_megatron_fsdp=cfg.dist.use_megatron_fsdp,
-            use_torch_fsdp2=cfg.dist.use_torch_fsdp2,
-            data_parallel_random_init=cfg.rng.data_parallel_random_init,
-        )
-    else:
-        return model_config.provide_distributed_model(
-            ddp_config=cfg.ddp,
-            use_megatron_fsdp=cfg.dist.use_megatron_fsdp,
-            use_torch_fsdp2=cfg.dist.use_torch_fsdp2,
-            overlap_param_gather_with_optimizer_step=cfg.optimizer.overlap_param_gather_with_optimizer_step,
-            data_parallel_random_init=cfg.rng.data_parallel_random_init,
-            pg_collection=pg_collection,
-        )
-
-
-def _update_model_config_funcs(
-    model: MegatronModule,
-    model_config: TransformerConfig,
-    ddp_config: DistributedDataParallelConfig,
-    optimizer: Optional[MegatronOptimizer],
-    *,
-    align_grad_reduce: bool = True,
-    pg_collection: Optional[ProcessGroupCollection] = None,
-) -> None:
-    """Update model config sync funcs based on initialized model."""
-    if isinstance(model[0], (DistributedDataParallel, megatron_FSDP)) and ddp_config.overlap_grad_reduce:
-        assert model_config.no_sync_func is None, (
-            "When overlap_grad_reduce is True, config.no_sync_func must be None; "
-            "a custom no_sync_func is not supported when overlapping grad-reduce"
-        )
-        model_config.no_sync_func = [model_chunk.no_sync for model_chunk in model]
-        if len(model) == 1:
-            model_config.no_sync_func = model_config.no_sync_func[0]
-        if align_grad_reduce:
-            model_config.grad_sync_func = [model_chunk.start_grad_sync for model_chunk in model]
-            if len(model) == 1:
-                model_config.grad_sync_func = model_config.grad_sync_func[0]
-    if ddp_config.overlap_param_gather and ddp_config.align_param_gather:
-        model_config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model]
-        if len(model) == 1:
-            model_config.param_sync_func = model_config.param_sync_func[0]
-    if optimizer is not None:
-        model_config.finalize_model_grads_func = partial(finalize_model_grads, pg_collection=pg_collection)
-        model_config.grad_scale_func = optimizer.scale_loss
-
-
-def _create_peft_pre_wrap_hook(
-    cfg: ConfigContainer, state: GlobalState
-) -> Callable[[list[MegatronModule]], list[MegatronModule]]:
-    """Create a pre-wrap hook that handles PEFT logic.
-
-    This hook is executed before the model is wrapped with DDP/FSDP and handles:
-    1. Loading pretrained checkpoints for PEFT
-    2. Applying PEFT transformation to the model
-
-    Args:
-        cfg: Configuration container
-        state: Global state object containing timers and other state
-
-    Returns:
-        A callable hook that can be registered with the model provider
-    """
-
-    def peft_pre_wrap_hook(model: list[MegatronModule]) -> list[MegatronModule]:
-        """Pre-wrap hook that handles PEFT transformation.
-
-        Args:
-            model: List of base model modules before distributed wrapping
-
-        Returns:
-            List of potentially PEFT-transformed model modules
-        """
-        # Only apply PEFT logic if PEFT is configured
-        if cfg.peft is None:
-            return model
-
-        print_rank_0("Applying PEFT pre-wrap hook...")
-
-        # Load pretrained checkpoint if available
-        if cfg.checkpoint.pretrained_checkpoint is None or not checkpoint_exists(cfg.checkpoint.pretrained_checkpoint):
-            raise ValueError(f"Invalid pretrained checkpoint directory found: {cfg.checkpoint.pretrained_checkpoint}")
-
-        # Explicitly set finetune to avoid loading optimizer and RNG states
-        cfg.checkpoint.finetune = True
-        state.timers("load-pretrained-checkpoint", log_level=0).start(barrier=True)
-        print_rank_0(f"Loading base model weights from: {cfg.checkpoint.pretrained_checkpoint}")
-
-        # Directly call load_checkpoint_from path in order to avoid
-        # the load directory overriding the pretrained checkpoint path
-        # This is needed to initialize the base model weights first, and then conditionally load adapter states after
-        _load_checkpoint_from_path(
-            load_dir=cfg.checkpoint.pretrained_checkpoint,
-            state=state,
-            model=model,
-            optimizer=None,  # Don't load optimizer - will be created after PEFT
-            opt_param_scheduler=None,  # Don't load scheduler - will be created after PEFT
-            checkpointing_context={},
-            skip_load_to_model_and_opt=False,
-            ignore_ckpt_step=True,  # ckpt_step applies only to adapter checkpoints, not pretrained base model
-        )
-        state.timers("load-pretrained-checkpoint").stop(barrier=True)
-        state.timers.log(["load-pretrained-checkpoint"])
-
-        # Apply PEFT transformation
-        transformed_model = _apply_peft_transformation(cfg.peft, model)
-
-        return transformed_model
-
-    return peft_pre_wrap_hook
-
-
-def _apply_peft_transformation(peft, base_model: list[MegatronModule]) -> list[MegatronModule]:
-    """Apply PEFT transformation to the base model.
-
-    Args:
-        peft: PEFT configuration/object
-        base_model: Base model before PEFT transformation
-
-    Returns:
-        Model with PEFT transformation applied
-    """
-    print_rank_0("Applying PEFT transformation...")
-    transformed_model = peft(base_model, training=True)
-    peft.set_params_to_save(transformed_model)
-
-    # Log PEFT statistics
-    model_to_analyze = transformed_model[0] if isinstance(transformed_model, list) else transformed_model
-    total_params = 0
-    trainable_params = 0
-    for param in model_to_analyze.parameters():
-        param_count = param.numel()
-        total_params += param_count
-        if param.requires_grad:
-            trainable_params += param_count
-
-    print_rank_0("PEFT Statistics:")
-    print_rank_0(f"  Total parameters: {total_params:,}")
-    print_rank_0(f"  Trainable parameters: {trainable_params:,}")
-    print_rank_0(f"  Trainable percentage: {100 * trainable_params / total_params:.2f}%")
-
-    return transformed_model
-
-
-def _validate_and_set_vocab_size(model_vocab_size: Optional[int], tokenizer_vocab_size: int) -> tuple[int, bool]:
-    """Validate and determine the correct vocab size for the model.
-
-    Args:
-        model_vocab_size: Vocab size set in model config (can be None)
-        tokenizer_vocab_size: Unpadded tokenizer vocab size
-
-    Returns:
-        tuple[int, bool]: The validated unpadded vocab size and padding flag
-            - vocab_size: The validated unpadded vocab size to use for the model
-            - should_pad_vocab: True if vocab should be padded, False otherwise
-
-    Raises:
-        ValueError: If model vocab size is invalid
-    """
-    if model_vocab_size is None:
-        # If model vocab size is not set, use the tokenizer's vocab size
-        # Enable padding since this came from tokenizer
-        return tokenizer_vocab_size, True
-    elif model_vocab_size < tokenizer_vocab_size:
-        # Vocab size smaller than tokenizer
-        raise ValueError(
-            f"Model vocab_size ({model_vocab_size}) cannot be smaller than tokenizer's vocab_size "
-            f"({tokenizer_vocab_size})."
-        )
-    else:
-        # Model vocab size is explicitly set and is >= tokenizer vocab size
-        # Disable padding since this was explicitly set
-        if model_vocab_size > tokenizer_vocab_size:
-            logging.info(
-                f"Using preset vocab_size: {model_vocab_size} over the tokenizer vocab_size: {tokenizer_vocab_size}, dummy tokens:"
-                f" {model_vocab_size - tokenizer_vocab_size}."
-            )
-        return model_vocab_size, False
-
-
-def maybe_log_and_save_config(cfg: ConfigContainer) -> None:
-    """Save configuration to disk and log non-default values on rank 0.
-
-    Instead of printing the full config YAML, this now logs only the values
-    that differ from Megatron Core defaults, making it easier to spot
-    unintended configuration deviations.
-
-    The full config can still be saved to a file via logger.save_config_filepath.
-    """
-
-    if get_rank_safe() != 0:
-        return
-
-    if cfg.logger.save_config_filepath is not None:
-        try:
-            cfg.to_yaml(cfg.logger.save_config_filepath)
-        except Exception as e:
-            print_rank_0(f"Error saving config to file {cfg.logger.save_config_filepath}: {e}")
-
-    cfg.log_non_default_values()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/training-loop-settings.md
-```md
-# Training Loop Configuration
-
-The {py:class}`bridge.training.config.TrainingConfig` contains settings related to the training loop bounds, exit conditions, validation, batch sizing, and memory management.
-
-## Key Parameters
-
-Configure these parameters to control core training behavior, resource utilization, and monitoring across distributed setups.
-
-### Batch Configuration
-Define how data is batched and distributed across devices during training.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `micro_batch_size` | `Optional[int]` | `None` | Batch size per model instance (local batch size) |
-| `global_batch_size` | `Optional[int]` | `None` | Training batch size across all devices |
-| `rampup_batch_size` | `Optional[list[int]]` | `None` | Batch size ramp up: `[start_size, increment, ramp_samples]` |
-| `decrease_batch_size_if_needed` | `bool` | `False` | Automatically decrease batch size if needed for fault tolerance |
-
-The relationship between batch sizes:
-- **Global batch size** = `micro_batch_size` × `data_parallel_size` × `gradient_accumulation_steps`
-- If `global_batch_size` is not set, it defaults to `micro_batch_size` × `data_parallel_size`
-
-### Training Duration
-
-Control when training stops using iteration counts, sample counts, or time-based limits.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `train_iters` | `Optional[int]` | `None` | Total number of iterations to train |
-| `train_samples` | `Optional[int]` | `None` | Total number of samples to train |
-| `exit_interval` | `Optional[int]` | `None` | Exit after iteration divisible by this value |
-| `exit_duration_in_mins` | `Optional[int]` | `None` | Exit after this many minutes |
-
-**Training Mode Selection**
-
-Megatron-Bridge supports two modes for specifying training duration:
-
-1. **Iteration-based training**: Specify `train_iters` to control the total number of training iterations.
-2. **Sample-based training**: Specify `train_samples` to control the total number of training samples.
-
-**Important constraints:**
-- You must specify **exactly one** of `train_iters` or `train_samples` - not both.
-- When using `train_samples`, training iterations are automatically calculated as `train_samples // global_batch_size`.
-- Batch size rampup (`rampup_batch_size`) is not currently supported with sample-based training.
-- Your scheduler configuration should match your training mode (see [Learning Rate Scheduling](optimizer-scheduler.md#learning-rate-scheduling)).
-
-### Validation
-Configure validation frequency, duration, and evaluation-only modes.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `eval_iters` | `int` | `100` | Number of iterations for validation/test evaluation |
-| `eval_interval` | `Optional[int]` | `1000` | Interval between validation runs |
-| `skip_train` | `bool` | `False` | Skip training, only do evaluation and exit |
-
-**Note:** To control validation behavior:
-- Set `eval_iters` to `0` to disable validation entirely (both during and after training).
-- Set `eval_interval` to `None` to skip validation during training, but still run validation after training completes.
-
-### Memory Management
-Control GPU memory cleanup and garbage collection to prevent memory issues during training.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `empty_unused_memory_level` | `Literal[0, 1, 2]` | `0` | Call `torch.cuda.empty_cache()` each iteration (0=off, 1=moderate, 2=aggressive) |
-| `manual_gc` | `bool` | `False` | Synchronize Python garbage collection across ranks to avoid stragglers |
-| `manual_gc_interval` | `int` | `0` | Training step interval for manual garbage collection (0=disabled) |
-| `manual_gc_eval` | `bool` | `True` | Enable garbage collection during evaluation when using manual GC |
-
-### Signal Handling and Exit Conditions
-Set up automatic checkpoint saving and clean exit procedures for signal-based interruptions.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `exit_signal_handler` | `bool` | `False` | Save checkpoint and shutdown gracefully on signal detection |
-| `exit_signal` | `int` | `signal.SIGTERM` | Signal to handle for graceful shutdown |
-| `exit_signal_handler_for_dataloader` | `bool` | `False` | Use signal handler for dataloader workers |
-
-### Performance Monitoring
-Monitor training consistency and synchronization across distributed processes.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `check_weight_hash_across_dp_replicas_interval` | `Optional[int]` | `None` | Check weight hash consistency across data parallel replicas |
-| `train_sync_interval` | `Optional[int]` | `None` | CPU-GPU synchronization interval to prevent CPU running ahead |
-
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/distillation.md
-```md
-# Knowledge Distillation
-
-Megatron Bridge provides a streamlined setup for Knowledge Distillation (KD) training, making it easy to enable and integrate into your workflow. This section explains how to use this feature effectively.
-
-Knowledge Distillation is a technique where a pre-trained model (the "teacher") transfers its learned knowledge to a second model (the "student"), which is typically smaller and faster. This process helps the student model learn more efficiently by mimicking the behavior of the teacher. KD offers two key advantages over traditional training: faster convergence and higher final accuracy.
-
-In Megatron Bridge, KD is enabled by NVIDIA Model Optimizer (ModelOpt) — a library to optimize deep-learning models for inference on GPUs.
-
-## Knowledge Distillation Process
-
-The KD process involves these steps:
-
-1. **Loads Checkpoints**: Loads both the student and teacher model checkpoints.
-2. **Replaces Loss Function**: Replaces the standard loss function with the KL-Divergence between the output logits (and potentially additional losses between pairs of intermediate model states).
-3. **Trains Models**: Runs forward passes on both models, but executes the backward pass only on the student model.
-4. **Saves Checkpoints**: Saves only the student model checkpoint, allowing it to be used later in the same manner as before.
-
-## Limitations
-
-* Only GPT-based checkpoints are currently supported.
-* Student and teacher models must support the same parallelism strategy.
-* If Pipeline Parallelism is enabled, intermediate-state based KD losses are only supported on the final pipeline stage.
-
-## Configuration
-
-### Knowledge Distillation Config
-
-You can configure the KD process via the `ModelOptDistillConfig` class or a YAML file. The configuration includes:
-
-* `logit_layers`: The layer names of student and teacher model logit layers. These names correspond to the PyTorch submodule attributes of the Megatron Core model. (For GPT-based models, this is `"output_layer"`). Default: `["output_layer", "output_layer"]`
-* `intermediate_layer_pairs`: A list of pairs of intermediate layer names. These pairs will by default have a Cosine-Similarity loss between them, and if tensor-parallelism is enabled, these layers must have sequence parallel outputs (i.e. LayerNorms), as Cosine loss cannot have a split hidden dimension. Default: `[["decoder.final_layernorm", "decoder.final_layernorm"]]`
-* `skip_lm_loss`: Whether to skip the default language modeling (LM) loss. If `false`, the LM loss is added to the distillation loss (note that this consumes more memory). Default: `true`
-* `kd_loss_scale`: Relative scale factor for the distillation loss. The cumulative logits-and-intermediate loss gets scaled to `kd_loss_scale` times the magnitude of the LM loss. Not used if `skip_lm_loss` is `true`. Default: `1.0`
-* `logit_kl_temperature`: Temperature variable for KL Divergence loss calculation. Default: `1.0`
-
-Example YAML configuration:
-
-```yaml
-logit_layers: ["output_layer", "output_layer"]
-intermediate_layer_pairs:
-  - ["decoder.final_layernorm", "decoder.final_layernorm"]
-logit_kl_temperature: 2.0
-```
-
-## Usage
-
-### Basic Usage with Default Configuration
-
-The simplest way to run knowledge distillation is to use or adapt one of the provided recipe scripts. Here's an example for distilling Llama3.2-3B into Llama3.2-1B:
-
-```bash
-uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py
-```
-
-### Using a Custom YAML Config File
-
-You can provide a custom YAML configuration file to override default settings:
-
-```bash
-uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \
-    --config-file my_custom_config.yaml
-```
-
-### Using CLI Overrides
-
-Megatron Bridge supports Hydra-style CLI overrides for flexible configuration:
-
-```bash
-uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \
-    model.tensor_model_parallel_size=2 \
-    model.teacher.tensor_model_parallel_size=2
-```
-
-### Combining YAML and CLI Overrides
-
-CLI overrides take precedence over YAML configuration:
-
-```bash
-uv run -m torch.distributed.run --nproc_per_node=2 examples/distillation/llama/distill_llama32_3b-1b.py \
-    --config-file conf/my_config.yaml \
-    train.global_batch_size=512
-```
-
-## Model Support
-
-Currently, distillation is supported for GPT- and Mamba-based models.
-
-To enable distillation for a model:
-
-1. Set the `teacher` attribute to the teacher model configuration
-2. Configure `kd_config` with the desired distillation settings (otherwise the defaults are used)
-3. Use `convert_to_distillation_provider()` to convert your existing model provider
-
-## Checkpointing
-
-During distillation training:
-
-* Only the **student model** checkpoints are saved
-* Teacher model remains frozen and is not modified
-* Checkpoints can be used for inference or further training like any standard checkpoint
-
-## Best Practices
-
-1. **Match Parallelism**: Ensure student and teacher use compatible parallelism configurations
-2. **Monitor Loss**: Track both distillation loss and (if enabled) language modeling loss
-3. **Batch Size**: Use larger batch sizes for better stability during distillation
-4. **Learning Rate**: Start with a smaller LR than pretraining
-5. **Data Quality**: Use high-quality, diverse training data for best distillation results
-
-## Troubleshooting
-
-### Out of Memory Errors
-
-* Reduce `train.micro_batch_size`
-* Increase parallelism sizes
-* Set `model.kd_config.skip_lm_loss = True` to save memory
-
-## References
-
-For more information on the underlying implementation, see:
-* [NVIDIA Model Optimizer](https://github.com/NVIDIA/Model-Optimizer)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/README.md
-```md
-# Training and Customization
-
-This directory contains comprehensive documentation for training and customizing models with Megatron Bridge. Learn how to configure training, optimize performance, and customize training workflows.
-
-## Quick Navigation
-
-### I want to
-
-**🚀 Get started with training**
-→ Start with [Configuration Container Overview](config-container-overview.md) to understand the training setup
-
-**⚙️ Configure training parameters**
-→ See [Training Loop Settings](training-loop-settings.md) and [Optimizer & Scheduler](optimizer-scheduler.md)
-
-**📊 Monitor and profile training**
-→ Check [Logging](logging.md) and [Profiling](profiling.md) guides
-
-**💾 Manage checkpoints**
-→ Read [Checkpointing](checkpointing.md) for saving and resuming training
-
-**⚡ Optimize performance**
-→ Explore [Performance Guide](../performance-guide.md) and [Performance Summary](../performance-summary.md)
-
-**🔧 Customize training**
-→ See [PEFT](peft.md), [Distillation](distillation.md), [Entry Points](entry-points.md), and [Callbacks](callbacks.md)
-
-## Core Training Documentation
-
-### Configuration and Setup
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Configuration Container Overview](config-container-overview.md)** | Central configuration object for all training settings | First time setting up training |
-| **[Entry Points](entry-points.md)** | Training entry points and execution flow | Understanding how training starts |
-| **[Training Loop Settings](training-loop-settings.md)** | Training loop parameters and configuration | Configuring batch sizes, iterations, validation |
-
-### Optimization and Performance
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Optimizer & Scheduler](optimizer-scheduler.md)** | Optimizer and learning rate scheduler configuration | Setting up optimization |
-| **[Mixed Precision](mixed-precision.md)** | Mixed precision training for memory efficiency | Reducing memory usage |
-| **[Communication Overlap](communication-overlap.md)** | Overlapping communication with computation | Optimizing distributed training |
-| **[Hybrid Context Parallel](hybrid-context-parallel.md)** | Hierarchical `a2a+p2p` context parallel guidance | Advanced long-sequence scaling |
-| **[Attention Optimizations](attention-optimizations.md)** | Optimizing attention mechanisms | Improving training speed |
-| **[Activation Recomputation](activation-recomputation.md)** | Gradient checkpointing strategies | Reducing memory footprint |
-| **[CPU Offloading](cpu-offloading.md)** | Offloading to CPU for memory management | Working with limited GPU memory |
-
-### Monitoring and Debugging
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Logging](logging.md)** | Logging configuration and TensorBoard/WandB integration | Monitoring training progress |
-| **[Profiling](profiling.md)** | Performance profiling and analysis | Identifying bottlenecks |
-| **[Resiliency](resiliency.md)** | Handling failures and recovery | Building robust training pipelines |
-
-### Advanced Features
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[PEFT](peft.md)** | Parameter-Efficient Fine-Tuning (LoRA, etc.) | Fine-tuning with limited resources |
-| **[Packed Sequences](packed-sequences.md)** | Sequence packing for efficiency | Optimizing data loading |
-| **[Megatron FSDP](megatron-fsdp.md)** | Stable overview of Megatron FSDP | Choosing an FSDP path |
-| **[Distillation](distillation.md)** | Knowledge distillation techniques | Transferring knowledge between models |
-| **[Checkpointing](checkpointing.md)** | Checkpoint saving, loading, and resuming | Managing training state |
-| **[Callbacks](callbacks.md)** | Inject custom logic into training loop | Custom logging, metrics, third-party integrations |
-
-## Training Workflow
-
-A typical training workflow involves:
-
-1. **Configure Training** - Set up `ConfigContainer` with model, data, and training parameters
-2. **Prepare Data** - Configure dataset loading and preprocessing
-3. **Set Optimization** - Configure optimizer, scheduler, and mixed precision
-4. **Enable Monitoring** - Set up logging and profiling
-5. **Configure Checkpointing** - Set up checkpoint saving and resuming
-6. **Launch Training** - Start training with configured entry points
-7. **Monitor Progress** - Track metrics via logging and profiling
-8. **Resume if Needed** - Use checkpointing to resume from saved state
-
-## Related Documentation
-
-- **[Main Documentation Index](../index.md)** - Return to main documentation
-- **[Performance Guide](../performance-guide.md)** - Comprehensive performance optimization guide
-- **[Performance Summary](../performance-summary.md)** - Quick performance reference
-- **[Recipe Usage](../recipe-usage.md)** - Using training recipes
-- **[Parallelisms](../parallelisms.md)** - Understanding distributed training strategies
-- **[Bridge Guide](../bridge-guide.md)** - Working with Hugging Face models
-
-## Common Training Scenarios
-
-### 🆕 First-Time Training Setup
-
-1. [Configuration Container Overview](config-container-overview.md) - Understand the configuration system
-2. [Entry Points](entry-points.md) - Learn how to start training
-3. [Training Loop Settings](training-loop-settings.md) - Configure basic training parameters
-4. [Logging](logging.md) - Set up monitoring
-
-### ⚡ Performance Optimization
-
-1. [Performance Guide](../performance-guide.md) - Comprehensive optimization strategies
-2. [Mixed Precision](mixed-precision.md) - Enable mixed precision training
-3. [Communication Overlap](communication-overlap.md) - Optimize distributed training
-4. [Activation Recomputation](activation-recomputation.md) - Reduce memory usage
-5. [Profiling](profiling.md) - Identify bottlenecks
-
-### 💾 Production Training
-
-1. [Checkpointing](checkpointing.md) - Reliable checkpoint management
-2. [Resiliency](resiliency.md) - Handle failures gracefully
-3. [Logging](logging.md) - Comprehensive monitoring
-4. [Profiling](profiling.md) - Performance analysis
-
-### 🔧 Customization
-
-1. [PEFT](peft.md) - Parameter-efficient fine-tuning
-2. [Distillation](distillation.md) - Knowledge distillation
-3. [Entry Points](entry-points.md) - Custom training workflows
-4. [Callbacks](callbacks.md) - Inject custom logic (third-party integrations)
-
----
-
-**Ready to start training?** Begin with [Configuration Container Overview](config-container-overview.md) or return to the [main documentation](../README.md).
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/communication-overlap.md
-```md
-# Communication Overlap
-
-Communication overlap reduces exposed communication cost in distributed training
-by hiding collectives or point-to-point transfers under useful compute.
-
-This page is the stable guide for what communication overlap is, when it tends
-to help, and which boundaries are durable across Megatron Bridge. For exact
-knobs, code anchors, and verification commands, see:
-
-- `skills/perf-techniques/tp-dp-comm-overlap/SKILL.md`
-- `skills/perf-techniques/expert-parallel-overlap/SKILL.md`
-
-## What It Is
-
-In Bridge, communication overlap is a family of related techniques rather than a
-single switch:
-
-| Mode | What gets hidden | Main gate |
-|---|---|---|
-| DP | gradient reduce-scatter and parameter all-gather | distributed-optimizer overlap path |
-| TP | tensor-parallel collectives under layer compute | `CommOverlapConfig.tp_comm_overlap` plus sequence parallelism |
-| PP | pipeline send/recv work under schedule execution | pipeline schedule and virtual pipeline layout |
-| CP | context-parallel communication inside CP execution paths | CP implementation choice |
-| EP | MoE token dispatch/combine communication under expert compute | `overlap_moe_expert_parallel_comm` |
-
-These paths share the same goal, but they do not share the same enablement
-rules, evidence level, or failure modes.
-
-## What Problem It Solves
-
-Distributed training often becomes communication-bound before it becomes
-compute-bound. Once TP, DP, PP, CP, or EP traffic is visible on the critical
-path, adding more GPUs may raise communication time faster than it raises useful
-compute.
-
-Communication overlap addresses that by moving communication earlier or later in
-the step so the same transfer can happen while some other part of the model is
-already doing useful work. It does not change the training objective. It tries
-to reduce idle time.
-
-## Impacted Training Dimensions
-
-| Dimension | Effect | Confidence | Why |
-|---|---|---|---|
-| `speed` | ~0-15% faster step time, mode-dependent | medium | The whole point is to hide communication time, but gain depends strongly on which overlap mode is active and whether communication is actually exposed. EP overlap measured flat to ~13% slower on small-EP Qwen3-30B-A3B, so gains are not guaranteed. |
-| `memory` | neutral (some modes add ~1-2 GB for buffers) | low | Overlap itself is usually not a primary memory technique, although some implementations (e.g., TP userbuffers) add buffer or scheduling constraints. |
-| `scale` | positive at higher parallelism degrees | medium | Overlap becomes more valuable as communication dominates larger distributed runs. |
-| `convergence` | no change expected | medium | The intent is to preserve the same training math, though schedule changes can alter floating-point accumulation order. |
-| `stability` | adds operational constraints | medium | More overlap usually means tighter requirements around schedule shape, precision, runtime versions, and feature combinations. |
-
-## When to Use It
-
-Enable communication overlap when all of the following are mostly true:
-
-- the distributed configuration already works correctly without overlap
-- communication is a meaningful part of step time
-- you are tuning throughput or utilization, not doing first bring-up
-- you can benchmark the specific overlap mode you plan to use
-
-As a rule of thumb:
-
-| Mode | Good first use case | Recommendation |
-|---|---|---|
-| DP | distributed optimizer on multi-GPU or multi-node training | Usually worth considering early once optimizer sharding is already chosen. |
-| TP | `TP >= 2` with sequence parallelism and TE-enabled path | Benchmark when TP collectives are visible in the profile. |
-| PP | interleaved pipeline schedules where p2p overhead is visible | Treat as schedule tuning, not a blanket PP default. |
-| CP | large-context runs already using CP | Follow the CP-specific guidance rather than treating it as a separate generic knob. |
-| EP | large-scale MoE with many micro-batches and inter-node A2A cost | Most promising at larger EP and with higher-latency dispatcher backends. |
-
-Measured repo evidence today is strongest for MoE EP overlap. On
-Qwen3-30B-A3B with EP=4 and `alltoall` on 2 H100 nodes, EP overlap is
-numerically safe at GBS=8 but provides no speedup, and it is about 13% slower
-at GBS=64. On Qwen3-Next-80B-A3B with EP=8 and `alltoall` on 8 nodes, the
-overlap variants are stable while the non-overlap baseline produces NaNs, but
-`delay_wgrad_compute` is still about 4.8% slower than overlap-only. That makes
-EP overlap correctness-backed in this repo, but not yet broadly speedup-backed.
-
-## When Not to Use It
-
-Avoid communication overlap when any of these are true:
-
-- you are still debugging a new distributed setup
-- the profile is compute-bound rather than communication-bound
-- the required companion feature is missing, such as sequence parallelism for TP
-- another feature already imposes conflicting runtime constraints
-- you have not benchmarked the exact model and parallelism shape
-
-For MoE EP overlap specifically, avoid treating it as a default when:
-
-- `EP <= 4` with `alltoall` on `<= 2` nodes
-- the run has very few pipeline micro-batches
-- `moe_shared_expert_overlap` must stay enabled
-- full recompute, or a recompute schedule that is incompatible with EP overlap, is required
-
-## Feature Interactions
-
-The most important interactions are:
-
-- DP overlap is tied to distributed-optimizer behavior rather than a fully independent tuning path.
-- TP overlap depends on sequence parallelism and the supported TE overlap path.
-- PP and EP overlap interact with virtual pipeline layout when `PP > 1`.
-- CP overlap should be reasoned about together with the chosen CP communication type.
-- EP overlap with DeepEP or HybridEP requires explicitly switching the dispatcher to `flex`.
-- EP overlap and `moe_shared_expert_overlap` are mutually exclusive.
-- CUDA graphs plus `delay_wgrad_compute` adds extra TE-version and graph-scope restrictions.
-- Launch-time environment tuning can conflict across overlap paths, especially TP or CP overlap versus DeepEP or HybridEP tuning.
-
-## Bridge Configuration
-
-Communication overlap is configured through `CommOverlapConfig` plus
-mode-specific model settings. There is no single universal toggle — DP, TP,
-PP, CP, and EP each have different prerequisites and should be enabled based
-on the actual bottleneck.
-
-For config examples and minimal runnable commands, see:
-
-- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md)
-- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md)
-
-## Expected Metric Changes
-
-| Metric | Expected Change | Conditions | Evidence |
-|---|---|---|---|
-| `step_time` | down | DP overlap with distributed optimizer on communication-heavy runs | expected |
-| `step_time` | down | TP overlap with `TP >= 2`, sequence parallelism, and supported TE path | expected |
-| `pipeline_idle_time` | down | interleaved PP where p2p cost is visible | expected |
-| `step_time` | flat | Qwen3-30B-A3B, EP=4, `alltoall`, 2 nodes, GBS=8 | measured: 822ms baseline vs 827ms overlap |
-| `step_time` | up | same model/config, GBS=64 | measured: 4889ms baseline vs 5538ms overlap |
-| `step_time` | up | Qwen3-Next-80B-A3B, EP=8, `alltoall`, 8 nodes, `delay_wgrad_compute=True` vs overlap-only | measured: 4912ms vs 4686ms |
-
-Do not assume one overlap win transfers automatically to another mode. The
-correct question is always "which communication path is exposed in this run?"
-
-## Common Failure Modes
-
-- TP overlap silently disables itself when sequence parallelism is off or `TP < 2`.
-- PP overlap expectations are wrong when the schedule is non-interleaved or VPP is missing.
-- EP overlap asserts when `PP > 1` but `virtual_pipeline_model_parallel_size` is unset.
-- EP overlap asserts when full recompute, recompute method, or shared-expert overlap stays enabled.
-- Setting `moe_flex_dispatcher_backend` alone does not activate DeepEP or HybridEP; the dispatcher must actually switch to `flex`.
-- Small-EP `alltoall` MoE runs can get slower because scheduling overhead is larger than the communication being hidden.
-
-## Related Docs
-
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/cuda-graphs.md](cuda-graphs.md)
-- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md)
-- [skills/perf-techniques/tp-dp-comm-overlap/SKILL.md](../skills/perf-techniques/tp-dp-comm-overlap/SKILL.md)
-- [skills/perf-techniques/expert-parallel-overlap/SKILL.md](../skills/perf-techniques/expert-parallel-overlap/SKILL.md)
-- [skills/perf-techniques/moe-comm-overlap/SKILL.md](../skills/perf-techniques/moe-comm-overlap/SKILL.md)
-- [skills/perf-techniques/moe-comm-overlap/card.yaml](../skills/perf-techniques/moe-comm-overlap/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/common.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from megatron.core.distributed import DistributedDataParallelConfig
-
-from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider
-from megatron.bridge.peft.lora import LoRA
-from megatron.bridge.recipes.utils.finetune_utils import default_squad_config
-from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
-from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
-from megatron.bridge.training.config import (
-    CheckpointConfig,
-    ConfigContainer,
-    DistributedInitConfig,
-    GPTDatasetConfig,
-    LoggerConfig,
-    RNGConfig,
-    TokenizerConfig,
-    TrainingConfig,
-    ValidationConfig,
-)
-
-
-def _pretrain_common() -> ConfigContainer:
-    """Build the shared pre-training ``ConfigContainer`` template.
-
-    ``model`` and ``tokenizer.tokenizer_model`` are left as ``None`` here; the
-    calling recipe MUST assign both before the config is used.
-
-    Returns:
-        ConfigContainer: Base configuration template for pre-training.
-    """
-    # All run artifacts live under <cwd>/nemo_experiments/default
-    experiment_dir = os.path.join(os.getcwd(), "nemo_experiments", "default")
-    ckpt_path = os.path.join(experiment_dir, "checkpoints")
-    tb_path = os.path.join(experiment_dir, "tb_logs")
-
-    # Optimizer / LR-scheduler pair (500 warmup iters, LR range 3e-5 .. 3e-4)
-    optimizer_config, lr_scheduler_config = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=3e-4,
-        min_lr=3e-5,
-    )
-
-    # Training loop, with manual garbage collection switched on
-    training = TrainingConfig(
-        train_iters=300000,
-        global_batch_size=32,
-        micro_batch_size=2,
-        manual_gc=True,
-        manual_gc_interval=100,
-        manual_gc_eval=100,
-    )
-
-    evaluation = ValidationConfig(
-        eval_interval=500,
-        eval_iters=32,
-    )
-
-    # DDP: distributed optimizer with grad-reduce / param-gather overlap flags set
-    data_parallel = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-        overlap_grad_reduce=True,
-        overlap_param_gather=True,
-        average_in_collective=True,
-        data_parallel_sharding_strategy="optim_grads_params",
-        use_distributed_optimizer=True,
-    )
-
-    # Dataset: no blend configured, so mock data is used by default
-    gpt_dataset = GPTDatasetConfig(
-        random_seed=1234,
-        reset_attention_mask=False,
-        reset_position_ids=False,
-        eod_mask_loss=False,
-        seq_length=4096,
-        num_dataset_builder_threads=1,
-        blend=None,  # Mock data mode
-        blend_per_split=None,
-        split="9999,8,2",
-        data_sharding=True,
-        dataloader_type="single",
-        skip_getting_attention_mask_from_dataset=True,
-    )
-
-    run_logger = LoggerConfig(
-        log_interval=10,
-        tensorboard_dir=tb_path,
-        log_timers_to_tensorboard=True,
-    )
-
-    # Placeholder tokenizer; every recipe supplies its own tokenizer_model
-    tokenizer_placeholder = TokenizerConfig(
-        tokenizer_type="HuggingFaceTokenizer",
-        tokenizer_model=None,  # Must be set by each recipe
-    )
-
-    # Checkpoints are saved to and loaded from the same directory
-    checkpointing = CheckpointConfig(
-        save_interval=500,
-        save=ckpt_path,
-        load=ckpt_path,
-        ckpt_format="torch_dist",
-        fully_parallel_save=True,
-    )
-
-    rng_state = RNGConfig(seed=1234)
-    dist_init = DistributedInitConfig()
-
-    return ConfigContainer(
-        model=None,  # type: ignore[arg-type]  # MUST be set by each recipe before use
-        train=training,
-        validation=evaluation,
-        optimizer=optimizer_config,
-        scheduler=lr_scheduler_config,
-        ddp=data_parallel,
-        dataset=gpt_dataset,
-        logger=run_logger,
-        tokenizer=tokenizer_placeholder,
-        checkpoint=checkpointing,
-        rng=rng_state,
-        dist=dist_init,
-        comm_overlap=None,
-        mixed_precision="bf16_mixed",  # bf16 by default
-    )
-
-
-def _sft_common() -> ConfigContainer:
-    """Build the shared full-SFT (Supervised Fine-Tuning) ``ConfigContainer`` template.
-
-    This template is for full-parameter fine-tuning (``peft=None``). ``model``
-    and ``tokenizer.tokenizer_model`` are left as ``None``; the calling recipe
-    MUST assign both before the config is used.
-
-    Defaults set here:
-    - Dataset built by ``default_squad_config`` with packed sequences enabled
-    - Peak learning rate 5e-6, decaying to 0.0, with 50 warmup iterations
-    - 1000 training iterations, global batch size 128, micro batch size 1
-    - ``pretrained_checkpoint`` slot present on the checkpoint config (unset)
-
-    Returns:
-        ConfigContainer: Base configuration template for full SFT.
-    """
-    # All run artifacts live under <cwd>/nemo_experiments/default
-    experiment_dir = os.path.join(os.getcwd(), "nemo_experiments", "default")
-    ckpt_path = os.path.join(experiment_dir, "checkpoints")
-    tb_path = os.path.join(experiment_dir, "tb_logs")
-
-    # Low LR for full fine-tuning; adam_beta2=0.98 is common for fine-tuning
-    optimizer_config, lr_scheduler_config = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=50,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=5e-6,
-        min_lr=0.0,
-        adam_beta2=0.98,
-    )
-
-    # Short training schedule for SFT
-    training = TrainingConfig(
-        train_iters=1000,
-        global_batch_size=128,
-        micro_batch_size=1,
-    )
-
-    evaluation = ValidationConfig(
-        eval_interval=100,
-        eval_iters=32,
-    )
-
-    # Minimal DDP settings; model-specific configs can override
-    data_parallel = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-    )
-
-    # SQuAD dataset, sequence length 2048, sequence packing on.
-    # pad_seq_to_mult should become context_parallel_size * 2 when CP > 1;
-    # override it in the model config in that case.
-    squad_dataset = default_squad_config(
-        seq_length=2048,
-        packed_sequence=True,
-        pad_seq_to_mult=1,
-    )
-
-    run_logger = LoggerConfig(
-        log_interval=1,
-        tensorboard_dir=tb_path,
-        log_timers_to_tensorboard=True,
-    )
-
-    # Placeholder tokenizer; every recipe supplies its own tokenizer_model
-    tokenizer_placeholder = TokenizerConfig(
-        tokenizer_type="HuggingFaceTokenizer",
-        tokenizer_model=None,  # Must be set by each recipe
-    )
-
-    # Checkpoints are saved to and loaded from the same directory
-    checkpointing = CheckpointConfig(
-        save_interval=100,
-        save=ckpt_path,
-        load=ckpt_path,
-        pretrained_checkpoint=None,  # Set to load from pretrained weights
-        ckpt_format="torch_dist",
-        fully_parallel_save=True,
-    )
-
-    rng_state = RNGConfig(seed=5678)
-    dist_init = DistributedInitConfig()
-
-    return ConfigContainer(
-        model=None,  # type: ignore[arg-type]  # MUST be set by each recipe before use
-        train=training,
-        validation=evaluation,
-        optimizer=optimizer_config,
-        scheduler=lr_scheduler_config,
-        ddp=data_parallel,
-        dataset=squad_dataset,
-        logger=run_logger,
-        tokenizer=tokenizer_placeholder,
-        checkpoint=checkpointing,
-        rng=rng_state,
-        dist=dist_init,
-        comm_overlap=None,
-        mixed_precision="bf16_mixed",  # bf16 by default
-        peft=None,  # No PEFT for full SFT
-    )
-
-
-def _peft_common() -> ConfigContainer:
-    """Build the shared PEFT (Parameter-Efficient Fine-Tuning) ``ConfigContainer`` template.
-
-    The template enables LoRA by default. ``model`` and
-    ``tokenizer.tokenizer_model`` are left as ``None``; the calling recipe MUST
-    assign both before the config is used.
-
-    Defaults set here:
-    - Peak learning rate 1e-4, decaying to 0.0, with 50 warmup iterations
-    - LoRA with dim=32, alpha=32 and no dropout
-    - LoRA target modules: linear_qkv, linear_proj, linear_fc1, linear_fc2
-    - Dataset built by ``default_squad_config`` with packed sequences enabled
-
-    Returns:
-        ConfigContainer: Base configuration template for PEFT with LoRA.
-    """
-    # All run artifacts live under <cwd>/nemo_experiments/default
-    experiment_dir = os.path.join(os.getcwd(), "nemo_experiments", "default")
-    ckpt_path = os.path.join(experiment_dir, "checkpoints")
-    tb_path = os.path.join(experiment_dir, "tb_logs")
-
-    # Higher LR because only adapters are trained; adam_beta2=0.98 is common for fine-tuning
-    optimizer_config, lr_scheduler_config = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=50,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=1e-4,
-        min_lr=0.0,
-        adam_beta2=0.98,
-    )
-
-    # Short training schedule for PEFT
-    training = TrainingConfig(
-        train_iters=1000,
-        global_batch_size=128,
-        micro_batch_size=1,
-    )
-
-    evaluation = ValidationConfig(
-        eval_interval=100,
-        eval_iters=32,
-    )
-
-    # Minimal DDP settings for PEFT
-    data_parallel = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-    )
-
-    # SQuAD dataset, sequence length 2048, sequence packing on.
-    # pad_seq_to_mult should become context_parallel_size * 2 when CP > 1;
-    # override it in the model config in that case.
-    squad_dataset = default_squad_config(
-        seq_length=2048,
-        packed_sequence=True,
-        pad_seq_to_mult=1,
-    )
-
-    run_logger = LoggerConfig(
-        log_interval=1,
-        tensorboard_dir=tb_path,
-        log_timers_to_tensorboard=True,
-    )
-
-    # Placeholder tokenizer; every recipe supplies its own tokenizer_model
-    tokenizer_placeholder = TokenizerConfig(
-        tokenizer_type="HuggingFaceTokenizer",
-        tokenizer_model=None,  # Must be set by each recipe
-    )
-
-    # Checkpoints are saved to and loaded from the same directory
-    checkpointing = CheckpointConfig(
-        save_interval=100,
-        save=ckpt_path,
-        load=ckpt_path,
-        pretrained_checkpoint=None,  # Set to load from pretrained weights
-        ckpt_format="torch_dist",
-        fully_parallel_save=True,
-    )
-
-    rng_state = RNGConfig(seed=5678)
-    dist_init = DistributedInitConfig()
-
-    # LoRA adapter with the standard defaults
-    lora_adapter = LoRA(
-        target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-        dim=32,
-        alpha=32,
-        dropout=0.0,
-        dropout_position="pre",
-        lora_A_init_method="xavier",
-        lora_B_init_method="zero",
-        a2a_experimental=False,
-        lora_dtype=None,  # Uses model's dtype
-    )
-
-    return ConfigContainer(
-        model=None,  # type: ignore[arg-type]  # MUST be set by each recipe before use
-        train=training,
-        validation=evaluation,
-        optimizer=optimizer_config,
-        scheduler=lr_scheduler_config,
-        ddp=data_parallel,
-        dataset=squad_dataset,
-        logger=run_logger,
-        tokenizer=tokenizer_placeholder,
-        checkpoint=checkpointing,
-        rng=rng_state,
-        dist=dist_init,
-        comm_overlap=None,
-        mixed_precision="bf16_mixed",  # bf16 by default
-        peft=lora_adapter,
-    )
-
-
-def _sft_common_vlm() -> ConfigContainer:
-    """Create a base SFT ConfigContainer with common defaults for Vision-Language Models.
-
-    This function inherits from `_sft_common()` and overrides VLM-specific settings.
-    The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use.
-
-    Key differences from LLM SFT (`_sft_common`):
-    - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2)
-    - Uses NullTokenizer (VLMs use processor instead of tokenizer)
-    - DDP config optimized for VLM training (no grad/param overlap)
-    - Supports freeze options for language_model, vision_model, vision_projection
-    - Different training defaults (train_iters=300000, GBS=32, MBS=2)
-    - Different RNG seed (1234)
-
-    Returns:
-        ConfigContainer: Base configuration template for VLM full SFT.
-    """
-    # Start from the LLM SFT common config
-    cfg = _sft_common()
-
-    # Default output directories
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for VLM
-    seq_length = 4096
-
-    # VLM-specific training config - longer training with different batch sizes
-    cfg.train.train_iters = 300000
-    cfg.train.global_batch_size = 32
-    cfg.train.micro_batch_size = 2
-    cfg.train.manual_gc = True
-    cfg.train.manual_gc_interval = 100
-    cfg.train.manual_gc_eval = 100
-
-    # VLM-specific validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
-    # VLM-specific optimizer settings - higher LR for VLM training
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=3e-4,
-        min_lr=3e-5,
-    )
-    cfg.optimizer = opt_cfg
-    cfg.scheduler = scheduler_cfg
-
-    # VLM-specific DDP config - no overlap for VLMs
-    cfg.ddp = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-        overlap_grad_reduce=False,
-        overlap_param_gather=False,
-        average_in_collective=True,
-        data_parallel_sharding_strategy="optim_grads_params",
-        use_distributed_optimizer=True,
-    )
-
-    # VLM-specific dataset - uses HuggingFace dataset provider
-    # hf_processor_path must be set by model-specific config
-    cfg.dataset = HFDatasetConversationProvider(
-        seq_length=seq_length,
-        hf_processor_path=None,  # Must be set by model-specific config
-        maker_name="make_cord_v2_dataset",
-        num_workers=2,
-        dataloader_type="single",
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        pack_sequences_in_batch=True,
-    )
-
-    # VLM uses NullTokenizer - actual tokenization is handled by the processor
-    cfg.tokenizer = TokenizerConfig(
-        tokenizer_type="NullTokenizer",
-        vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
-    )
-
-    # VLM-specific logger config
-    cfg.logger = LoggerConfig(
-        log_interval=10,
-        tensorboard_dir=tensorboard_dir,
-        log_timers_to_tensorboard=True,
-    )
-
-    # VLM-specific checkpoint config
-    cfg.checkpoint.save_interval = 500
-    cfg.checkpoint.save = checkpoint_dir
-    cfg.checkpoint.load = checkpoint_dir
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.fully_parallel_save = True
-
-    # VLM uses different RNG seed
-    cfg.rng = RNGConfig(seed=1234)
-
-    return cfg
-
-
-def _peft_common_vlm() -> ConfigContainer:
-    """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models.
-
-    This function inherits from `_peft_common()` and overrides VLM-specific settings.
-    The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use.
-
-    Key differences from LLM PEFT (`_peft_common`):
-    - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2)
-    - Uses NullTokenizer (VLMs use processor instead of tokenizer)
-    - DDP config optimized for VLM training (no grad/param overlap)
-    - Supports freeze options for language_model, vision_model, vision_projection
-    - Different training defaults (train_iters=300000, GBS=32, MBS=2)
-    - Different RNG seed (1234)
-    - Higher LR (1e-4) for adapter training
-
-    Returns:
-        ConfigContainer: Base configuration template for VLM PEFT with LoRA.
-    """
-    # Start from the LLM PEFT common config
-    cfg = _peft_common()
-
-    # Default output directories
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for VLM
-    seq_length = 4096
-
-    # VLM-specific training config - longer training with different batch sizes
-    cfg.train.train_iters = 300000
-    cfg.train.global_batch_size = 32
-    cfg.train.micro_batch_size = 2
-    cfg.train.manual_gc = True
-    cfg.train.manual_gc_interval = 100
-    cfg.train.manual_gc_eval = 100
-
-    # VLM-specific validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
-    # VLM-specific optimizer settings - higher LR for PEFT
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=1e-4,  # Higher LR for adapter training
-        min_lr=1e-5,
-    )
-    cfg.optimizer = opt_cfg
-    cfg.scheduler = scheduler_cfg
-
-    # VLM-specific DDP config - no overlap for VLMs
-    cfg.ddp = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-        overlap_grad_reduce=False,
-        overlap_param_gather=False,
-        average_in_collective=True,
-        data_parallel_sharding_strategy="optim_grads_params",
-        use_distributed_optimizer=True,
-    )
-
-    # VLM-specific dataset - uses HuggingFace dataset provider
-    # hf_processor_path must be set by model-specific config
-    cfg.dataset = HFDatasetConversationProvider(
-        seq_length=seq_length,
-        hf_processor_path=None,  # Must be set by model-specific config
-        maker_name="make_cord_v2_dataset",
-        num_workers=2,
-        dataloader_type="single",
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        pack_sequences_in_batch=True,
-    )
-
-    # VLM uses NullTokenizer - actual tokenization is handled by the processor
-    cfg.tokenizer = TokenizerConfig(
-        tokenizer_type="NullTokenizer",
-        vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
-    )
-
-    # VLM-specific logger config
-    cfg.logger = LoggerConfig(
-        log_interval=10,
-        tensorboard_dir=tensorboard_dir,
-        log_timers_to_tensorboard=True,
-    )
-
-    # VLM-specific checkpoint config
-    cfg.checkpoint.save_interval = 500
-    cfg.checkpoint.save = checkpoint_dir
-    cfg.checkpoint.load = checkpoint_dir
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.fully_parallel_save = True
-
-    # VLM uses different RNG seed
-    cfg.rng = RNGConfig(seed=1234)
-
-    # Keep LoRA config from _peft_common() - it's already set with standard defaults
-
-    return cfg
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/hybrid-context-parallel.md
-```md
-# Hybrid / Hierarchical Context Parallel
-
-This page covers the stable Bridge-facing meaning of hierarchical context
-parallelism, especially the `a2a+p2p` transport path and
-`hierarchical_context_parallel_sizes`.
-
-For operational setup, code anchors, and verification commands, see
-[skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md).
-
-## What It Is
-
-Context parallelism (CP) splits the input sequence across GPUs so each rank
-processes a chunk. The GPUs must communicate KV data during attention. There are
-several CP communication backends:
-
-| `cp_comm_type` | Mechanism | Async / Overlap | Constraint |
-|---|---|---|---|
-| `"p2p"` | Ring-exchange of KV chunks | Yes | None |
-| `"all_gather"` | All-gather full KV before attention | No | None |
-| `"a2a"` | All-to-all: scatter heads, gather full sequence (Ulysses-style) | N/A | **CP <= num_kv_heads** |
-| `"a2a+p2p"` | Hierarchical: a2a within inner group, p2p across outer group | Partial (p2p part) | Requires `hierarchical_context_parallel_sizes` |
-
-**HCP (`a2a+p2p`)** exists to scale CP beyond the KV head count by combining
-a2a (fast, head-parallel) on intra-node links with p2p (async,
-sequence-parallel) on inter-node links.
-
-It is important to separate this from the upstream boolean
-`hybrid_context_parallel`, which is a different feature for balancing packed or
-variable-length workloads. The two concepts should not be treated as
-interchangeable.
-
-### Why a2a is limited by KV heads
-
-a2a transposes the parallelism dimension: each rank trades its sequence chunk
-for a subset of attention heads. After the all-to-all, every rank has the
-**full sequence** but only `heads / CP` heads. This means:
-
-- `heads / CP` must be a positive integer.
-- The bottleneck is KV heads (not Q heads), because in GQA the KV heads are the
-  indivisible unit.
-- If the model has 8 KV heads, pure a2a supports at most CP=8.
-
-HCP breaks this limit by applying a2a only within a sub-group small enough to
-fit within the KV head count.
-
-## When to Use It
-
-**Use HCP when ALL of these are true:**
-
-1. You need CP larger than `num_kv_heads / TP` (pure a2a won't fit).
-2. You cannot (or don't want to) increase TP to shrink CP.
-3. Your cluster has a clear bandwidth hierarchy (e.g., NVLink intra-node >> IB
-   inter-node).
-
-**Prefer pure `a2a` when:**
-
-- You can adjust TP so that `CP <= num_kv_heads / TP`. This is simpler, avoids
-  the p2p overhead, and often yields the same throughput with better memory
-  headroom.
-
-**Prefer pure `p2p` when:**
-
-- You have very few KV heads or want maximum CP flexibility.
-- Your workload can hide the p2p latency behind compute (long sequences help).
-
-### Decision example
-
-Model: 8 KV heads. Cluster: 4 nodes x 8 GPUs. Goal: train 128K sequences.
-
-| Option | TP | CP | `cp_comm_type` | Notes |
-|---|---|---|---|---|
-| A | 1 | 16 | `a2a+p2p` with `[8,2]` | a2a intra-node (8 GPUs), p2p across 2 node-groups |
-| B | 2 | 4 | `a2a` | CP=4 <= 8 KV heads. Simpler. Often same throughput. |
-| C | 1 | 16 | `p2p` | Works but no a2a bandwidth benefit intra-node |
-
-In practice, **option B is usually preferred** -- benchmarks showed identical
-throughput to option A with more memory headroom.
-
-It should be treated as an advanced feature rather than a default recommendation.
-
-## Stable Bridge Limitation
-
-The most important Bridge-specific limitation is that hierarchical context
-parallelism is currently supported only on the MPU initialization path.
-
-In practice, that means:
-
-- `dist.use_decentralized_pg=False` is the supported Bridge path
-- the decentralized process-group path should not be assumed to materialize HCP
-  groups
-
-## Stable Constraints
-
-The durable constraints are:
-
-- `hierarchical_context_parallel_sizes` must match
-  `context_parallel_size` multiplicatively
-- the usual CP sequence-length divisibility rules still apply
-- Transformer Engine version support matters for `a2a+p2p`
-
-## Recommendation Level
-
-Use hierarchical context parallelism in Bridge only when you intentionally want
-that transport path and are prepared to validate execution-path details. It is
-not yet the kind of feature that should be presented as universally safe across
-all Bridge initialization modes.
-
-## Related Docs
-
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/communication-overlap.md](communication-overlap.md)
-- [skills/perf-techniques/hybrid-context-parallel/SKILL.md](../skills/perf-techniques/hybrid-context-parallel/SKILL.md)
-- [skills/perf-techniques/hybrid-context-parallel/card.yaml](../skills/perf-techniques/hybrid-context-parallel/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/profiling.md
-```md
-# Profiling
-
-Megatron Bridge provides built-in support for profiling training jobs using a range of performance analysis tools. These include NVIDIA Nsight Systems (Nsys) for workflow optimization, as well as PyTorch-based profilers and memory trackers to monitor performance and memory usage patterns during training.
-
-## ProfilingConfig Overview
-
-{py:class}`bridge.training.config.ProfilingConfig` is a dataclass that encapsulates profiling-related settings for training. It resides inside the overall {py:class}`bridge.training.config.ConfigContainer`, which represents the complete configuration for a training run.
-
-
-### Profiling Options
-
-The configuration supports two mutually exclusive profiling options:
-
-- **NSys profiling** (`use_nsys_profiler`)
-- **PyTorch profiling** (`use_pytorch_profiler`)
-
-You can enable one or the other, but not both at the same time.
-
-
-### Step Range and Target Ranks
-
-All profiling modes allow you to configure:
-
-- **Step range**: `profile_step_start` and `profile_step_end`
-- **Target ranks**: `profile_ranks`
-
-By default, profiling targets rank 0. You can specify multiple ranks to analyze different parts of your distributed training setup.
-
-
-### Advanced Profiling Features
-
-The configuration includes options for recording tensor shapes (`record_shapes`) and enabling memory profiling (`record_memory_history`) with a customizable output path (`memory_snapshot_path`). These features offer deeper visibility into your model’s memory consumption and tensor-level operations during training.
-
-
-## NSys Profiling
-
-NVIDIA Nsys is a system-wide performance analysis tool designed to help you tune and optimize CUDA applications. Megatron Bridge integrates with Nsys to enable profiling specific steps of your training job, making it easy to collect detailed performance data without manual instrumentation.
-
-```{note}
-NSys profiling cannot be used with the `FaultTolerancePlugin` due to implementation conflicts. If both are enabled, the framework will automatically disable NSys profiling and emit a warning.
-```
-
-### Configure NSys Profiling
-
-Enable NSys profiling by setting `use_nsys_profiler=True` in your `ProfilingConfig`. The key configuration options include:
-
-```python
-from megatron.bridge.training.config import ProfilingConfig
-
-# In your ConfigContainer setup, cfg is a ConfigContainer instance
-cfg.profiling = ProfilingConfig(
-    use_nsys_profiler=True,
-    profile_step_start=10,
-    profile_step_end=15,
-    profile_ranks=[0, 1],  # Profile first two ranks
-    record_shapes=False,   # Optional: record tensor shapes
-)
-```
-
-### Launch with NSys
-
-When using NSys profiling, launch your training script with the NSys command wrapper:
-
-```bash
-nsys profile -s none -o <profile_filepath> -t cuda,nvtx --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop python <path_to_script>
-```
-
-Replace `<profile_filepath>` with your desired output path and `<path_to_script>` with your training script. The `--capture-range=cudaProfilerApi` option ensures profiling is controlled by the framework's step range configuration.
-
-### Configure Profiling with the NeMo Run NSys Plugin
-
-Recipe users can leverage the {py:class}`bridge.recipes.run_plugins.NsysPlugin` to configure NSys profiling through NeMo Run executors. The plugin provides a convenient interface for setting up profiling without manually configuring the underlying NSys command.
-
-```python
-import nemo_run as run
-from megatron.bridge.recipes.run_plugins import NsysPlugin
-
-# Create your recipe and executor
-recipe = your_recipe_function()
-executor = run.SlurmExecutor(...)
-
-# Configure NSys profiling via plugin
-plugins = [
-    NsysPlugin(
-        profile_step_start=10,
-        profile_step_end=15,
-        profile_ranks=[0, 1],
-        nsys_trace=["nvtx", "cuda"],  # Optional: specify trace events
-        record_shapes=False,
-        nsys_gpu_metrics=False,
-    )
-]
-
-# Run with profiling enabled
-with run.Experiment("nsys_profiling_experiment") as exp:
-    exp.add(recipe, executor=executor, plugins=plugins)
-    exp.run()
-```
-
-The plugin automatically configures the NSys command line options and sets up the profiling configuration in your training job.
-
-### Analyze Results
-
-After your profiling run completes, the NSys profile files (`.nsys-rep`) will be generated. To analyze them, install [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems) from the NVIDIA Developer website, open the files in the NSys GUI, and use the timeline view to explore the performance characteristics of your training job.
-
-## PyTorch Profiler
-
-Megatron Bridge supports the built-in PyTorch profiler, which is useful for viewing profiles in TensorBoard and understanding PyTorch-level performance characteristics.
-
-### Configure PyTorch Profiler
-
-Enable PyTorch profiling by setting `use_pytorch_profiler=True` in your `ProfilingConfig`:
-
-```python
-from megatron.bridge.training.config import ProfilingConfig
-
-cfg.profiling = ProfilingConfig(
-    use_pytorch_profiler=True,
-    profile_step_start=10,
-    profile_step_end=15,
-    profile_ranks=[0],
-    record_shapes=True,    # Record tensor shapes for detailed analysis
-)
-```
-
-### Configure Profiling with the PyTorch Profiler Plugin
-
-Similar to NSys, recipe users can use the {py:class}`bridge.recipes.run_plugins.PyTorchProfilerPlugin` for convenient configuration:
-
-```python
-from megatron.bridge.recipes.run_plugins import PyTorchProfilerPlugin
-
-plugins = [
-    PyTorchProfilerPlugin(
-        profile_step_start=10,
-        profile_step_end=15,
-        profile_ranks=[0],
-        record_memory_history=True,
-        memory_snapshot_path="memory_snapshot.pickle",
-        record_shapes=True,
-    )
-]
-```
-
-## Memory Profiling
-
-Megatron Bridge provides built-in support for CUDA memory profiling to track and analyze memory usage patterns during training, including GPU memory allocation and consumption tracking.
-
-More information about the generated memory profiles can be found [here](https://pytorch.org/blog/understanding-gpu-memory-1/).
-
-### Configure Memory Profiling
-
-Enable memory profiling by setting `record_memory_history=True` in your `ProfilingConfig`. This can be used with either profiling mode:
-
-```python
-from megatron.bridge.training.config import ProfilingConfig
-
-cfg.profiling = ProfilingConfig(
-    use_pytorch_profiler=True,  # or use_nsys_profiler=True
-    profile_step_start=10,
-    profile_step_end=15,
-    profile_ranks=[0],
-    record_memory_history=True,
-    memory_snapshot_path="memory_trace.pickle",  # Customize output path
-)
-```
-
-### Analyze Memory Usage
-
-After the run completes, memory snapshots for each specified rank are saved to the designated path. Load these traces using the PyTorch Memory Viz tool to plot memory usage over time and detect bottlenecks or leaks in your training pipeline.
-
-## Optimize Profiling Accuracy
-
-Profiling adds overhead to your training job, so measured timings may be slightly higher than normal operation. For accurate profiling results, disable other intensive operations like frequent checkpointing during the profiled step range. Choose your profiling step range carefully to capture representative training behavior while minimizing the performance impact on the overall job.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/README.md
-```md
-# Training Scripts
-
-Generic launcher and training scripts that work with any GPT-based model family (e.g. Deepseek, Llama, Gemma, Qwen, GPT, etc.).
-
-## Overview
-
-These scripts provide a generic interface for training GPT-based models in Megatron Bridge:
-
-- `run_recipe.py` - Generic pretraining/finetuning for GPT- and Mamba-based models.
-- `launch_with_nemo_run.py` - NeMo-Run launcher (local or Slurm)
-- `launch_with_sbatch.sh` - Direct sbatch launcher
-
-All scripts dynamically import recipes from `megatron.bridge.recipes`, apply user-provided overrides to the configuration, then begin training.
-
-## Quick Start
-
-For the end-to-end overview of how recipes are structured, overridden, and launched, see the official [Using Recipes guide](https://docs.nvidia.com/nemo/megatron-bridge/latest/recipe-usage.html).
-
-### Pretrain (single-GPU)
-
-```bash
-uv run python run_recipe.py --recipe llama32_1b_pretrain_config
-```
-
-### Pretrain (multi-GPU)
-
-```bash
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_pretrain_config
-```
-
-### Finetune
-
-```bash
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_sft_config
-```
-
-## Usage with Different Models
-
-Same scripts work across all model families:
-
-```bash
-# Llama
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe llama32_1b_pretrain_config
-
-# Gemma
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe gemma3_1b_pretrain_config
-
-# Qwen
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe qwen3_8b_pretrain_config
-
-# GPT
-uv run torchrun --nproc_per_node=8 run_recipe.py --recipe gpt_126m_pretrain_config
-```
-
-## CLI Overrides
-
-Override any config field using dot notation:
-
-```bash
-uv run torchrun --nproc_per_node=8 run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    train.train_iters=5000 \
-    optimizer.lr=0.0002 \
-    model.tensor_model_parallel_size=2
-```
-
-The first part before the dot specifies which ConfigContainer subconfig to override (e.g., `train`, `model`, `optimizer`), and the part after specifies the field.
-
-Configuration priority:
-1. CLI overrides (highest)
-2. Recipe defaults (lowest)
-
-Mode is inferred from the recipe name. If your recipe name doesn't include
-`pretrain`, `finetune`, `sft`, or `peft`, pass `--mode` explicitly.
-
-## Step Function Selection
-
-Use `--step_func` to control the step function used during training. Available options:
-
-- `gpt_step` - Text-only models (default)
-- `vlm_step` - Vision-language models
-- `llava_step` - LLaVA models
-
-```bash
-uv run torchrun --nproc_per_node=8 run_recipe.py \
-    --recipe qwen25_vl_pretrain_config \
-    --step_func vlm_step
-```
-
-## Multi-Node and Distributed Training
-
-### Option 1: NeMo-Run
-
-Prerequisites:
-
-```bash
-pip install nemo-run
-```
-
-#### Test Locally First
-
-Before launching on Slurm, test your configuration locally:
-
-```bash
-python launch_with_nemo_run.py \
-    --local \
-    --script run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    --devices 2 \
-    --dry-run \
-    train.train_iters=10
-```
-
-This uses `LocalExecutor` with torchrun for single-node testing. Include `--dry-run` to confirm the composed nemo-run command before actually launching it.
-
-#### Launch on Slurm
-
-Once tested, scale to Slurm by removing `--local` and adding Slurm parameters:
-
-```bash
-# From the cluster (LocalTunnel)
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    --nodes 2 \
-    --devices 8 \
-    --partition gpu \
-    --account my_account
-
-# From your local machine (SSHTunnel)
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    --nodes 2 \
-    --devices 8 \
-    --partition gpu \
-    --account my_account \
-    --ssh-tunnel \
-    --host my-cluster.example.com \
-    --user myusername \
-    --remote-job-dir /home/myusername/nemo-runs
-```
-
-#### With Containers
-
-When using containers, scripts are automatically packaged using `PatternPackager`:
-
-```bash
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe qwen3_8b_pretrain_config \
-    --nodes 4 \
-    --devices 8 \
-    --partition gpu \
-    --account my_account \
-    --container-image /path/to/container.sqsh \
-    --mount /data:/data
-```
-
-> **Note:** PatternPackager only includes `scripts/training/*.py`. Local changes in
-> `src/megatron/bridge/` stay on your workstation unless you mount the repo into
-> the container.
-
-```bash
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    --nodes 2 \
-    --partition gpu \
-    --account my_account \
-    --container-image /path/to/container.sqsh \
-    --mount /path/to/your/Megatron-Bridge:/opt/Megatron-Bridge \
-    train.train_iters=10
-```
-
-Mounting onto `/opt/Megatron-Bridge` shadows the container's built-in source so
-your edited `src/megatron/bridge/` files are used while packaged scripts still
-run from the container workspace.
-
-For git-based packaging:
-
-```bash
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe llama3_8b_pretrain_config \
-    --nodes 2 \
-    --partition gpu \
-    --account my_account \
-    --container-image /path/to/container.sqsh \
-    --packager git
-```
-
-#### Fault-Tolerant Training
-
-Use the fault-tolerant launcher for better resiliency:
-
-```bash
-python launch_with_nemo_run.py \
-    --script run_recipe.py \
-    --recipe llama32_1b_pretrain_config \
-    --launcher ft \
-    --nodes 2 \
-    --partition gpu \
-    --account my_account
-```
-
-### Option 2: Direct sbatch
-
-For traditional HPC workflows without NeMo-Run, use the `launch_with_sbatch.sh` script.
-
-Edit the configuration section in `launch_with_sbatch.sh`:
-
-```bash
-# Training script to run
-TRAINING_SCRIPT="run_recipe.py"
-
-# Recipe name
-RECIPE="llama32_1b_pretrain_config"
-
-# Step function (controls the step function: gpt_step, vlm_step, or llava_step)
-STEP_TYPE="gpt_step"
-
-# Optional: CLI overrides
-CLI_OVERRIDES="train.train_iters=5000 optimizer.lr=0.0003"
-
-# Optional: Container settings
-CONTAINER_IMAGE="/path/to/container.sqsh"
-CONTAINER_MOUNTS="/data:/data /model:/model"
-```
-
-Also configure the SBATCH directives at the top of the file:
-
-```bash
-#SBATCH --nodes=2
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8
-#SBATCH --partition=gpu
-#SBATCH --account=my_account
-#SBATCH --time=04:00:00
-```
-
-Then submit:
-
-```bash
-sbatch launch_with_sbatch.sh
-```
-
-The script automatically:
-- Sets up multi-node torchrun with correct SLURM environment variables
-- Passes recipe and CLI override arguments to the training script
-- Handles container execution (if specified)
-- Applies container mounts
-
-## Recipe Arguments
-
-Generic scripts call recipes with no arguments passed to the recipe function.
-
-All customization happens through CLI overrides after the config is built.
-
-If you need to pass arguments to the recipe constructor itself (e.g., custom parallelism at recipe build time), use model-specific examples or create a custom script.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/launch_with_nemo_run.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Launch Training with NeMo-Run
-
-Generic launcher for training scripts. Supports local execution and Slurm clusters.
-
-Prerequisites: Install nemo-run
-
-Usage:
-    # Test locally (single node)
-    python launch_with_nemo_run.py \
-        --local \
-        --script run_recipe.py \
-        --recipe llama32_1b_pretrain_config \
-        --devices 2
-
-    # Launch on Slurm from the cluster (LocalTunnel)
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe llama32_1b_pretrain_config \
-        --nodes 2 \
-        --partition gpu \
-        --account my_account
-
-    # Launch on Slurm from your local machine (SSHTunnel)
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe llama32_1b_sft_config \
-        --nodes 1 \
-        --partition gpu \
-        --account my_account \
-        --ssh-tunnel \
-        --host my-cluster.example.com \
-        --user myusername \
-        --remote-job-dir /home/myusername/nemo-runs
-
-    # With CLI overrides
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe gemma3_1b_pretrain_config \
-        --nodes 1 \
-        --partition gpu \
-        --account my_account \
-        train.train_iters=5000 \
-        optimizer.lr=0.0002
-
-    # With containers (uses PatternPackager by default)
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe qwen3_8b_pretrain_config \
-        --nodes 1 \
-        --partition gpu \
-        --account my_account \
-        --container-image /path/to/container.sqsh \
-        --mount /data:/data
-
-    # With custom packager (git archive)
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe llama3_8b_pretrain_config \
-        --nodes 2 \
-        --partition gpu \
-        --account my_account \
-        --container-image /path/to/container.sqsh \
-        --packager git
-
-    # With environment variables (HF token, W&B key, etc.)
-    python launch_with_nemo_run.py \
-        --script /opt/Megatron-Bridge/scripts/training/run_recipe.py \
-        --recipe llama32_1b_pretrain_config \
-        --nodes 1 \
-        --partition gpu \
-        --account my_account \
-        --container-image /path/to/container.sqsh \
-        --mount /path/to/Megatron-Bridge:/opt/Megatron-Bridge \
-        --env HF_TOKEN=your_token \
-        --env WANDB_API_KEY=your_key
-
-    # With fault-tolerant launcher
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe llama32_1b_pretrain_config \
-        --launcher ft \
-        --nodes 2 \
-        --partition gpu \
-        --account my_account
-
-    # Wait for completion and tail logs
-    python launch_with_nemo_run.py \
-        --script run_recipe.py \
-        --recipe llama32_1b_pretrain_config \
-        --nodes 1 \
-        --partition gpu \
-        --account my_account \
-        --no-detach \
-        --tail-logs
-
-Note:
-- Use --local for single-node testing with LocalExecutor
-- Use --ssh-tunnel when launching to Slurm from your local machine
-- Omit --ssh-tunnel when already on the Slurm cluster (uses LocalTunnel)
-- By default, jobs are submitted and detached (use --no-detach --tail-logs to monitor)
-- With containers, scripts are auto-packaged using PatternPackager (or use --packager git)
-- Any unknown arguments are forwarded to the training script
-- Adjust cluster-specific settings (account, partition, container paths)
-"""
-
-import argparse
-import logging
-from pathlib import Path
-
-import nemo_run as run
-
-
-logger = logging.getLogger(__name__)
-
-SCRIPT_DIR = Path(__file__).parent.resolve()
-
-
-def parse_args() -> tuple[argparse.Namespace, list[str]]:
-    """Define the launcher's command-line options and parse ``sys.argv``.
-
-    Returns:
-        The ``(namespace, extras)`` pair produced by ``parse_known_args``:
-        the recognized launcher options, followed by the list of tokens the
-        parser did not recognize.
-    """
-    # One ``(flag, add_argument keywords)`` entry per option, listed in the
-    # order the options are registered with the parser.
-    option_specs = [
-        (
-            "--local",
-            dict(
-                action="store_true",
-                help="Run locally with LocalExecutor (single node). Omit for Slurm execution.",
-            ),
-        ),
-        (
-            "--script",
-            dict(
-                type=str,
-                required=True,
-                help="Training script to run (e.g., run_recipe.py, pretrain_vlm.py, finetune_vlm.py)",
-            ),
-        ),
-        (
-            "--recipe",
-            dict(
-                type=str,
-                required=True,
-                help="Recipe name (e.g., llama32_1b_pretrain_config)",
-            ),
-        ),
-        (
-            "--launcher",
-            dict(
-                type=str,
-                default="torchrun",
-                choices=["torchrun", "ft", "default"],
-                help="Launcher to use: 'torchrun', 'ft' (fault-tolerant), or 'default' (no launcher)",
-            ),
-        ),
-        (
-            "--devices",
-            dict(
-                type=int,
-                default=None,
-                help="GPUs per node. Required for --local. For Slurm, omit if cluster auto-allocates whole nodes.",
-            ),
-        ),
-        (
-            "--nodes",
-            dict(
-                type=int,
-                default=1,
-                help="Number of nodes to use (Slurm only, ignored for --local)",
-            ),
-        ),
-        (
-            "--partition",
-            dict(type=str, help="Slurm partition name (required for Slurm execution)"),
-        ),
-        (
-            "--account",
-            dict(type=str, help="Slurm account name (required for Slurm execution)"),
-        ),
-        (
-            "--time",
-            dict(type=str, default="04:00:00", help="Job time limit"),
-        ),
-        (
-            "--gres",
-            dict(type=str, default=None, help="Slurm GRES (e.g., 'gpu:8')."),
-        ),
-        (
-            "--ssh-tunnel",
-            dict(
-                action="store_true",
-                help="Use SSH tunnel (for launching from local machine). Requires --host, --user, --remote-job-dir",
-            ),
-        ),
-        (
-            "--host",
-            dict(type=str, help="SSH host for tunnel (required if --ssh-tunnel is set)"),
-        ),
-        (
-            "--user",
-            dict(type=str, help="SSH user for tunnel (required if --ssh-tunnel is set)"),
-        ),
-        (
-            "--remote-job-dir",
-            dict(
-                type=str,
-                help="Remote directory to store job files (required if --ssh-tunnel is set)",
-            ),
-        ),
-        (
-            "--identity",
-            dict(
-                type=str,
-                default=None,
-                help="Path to SSH private key for authentication",
-            ),
-        ),
-        (
-            "--container-image",
-            dict(type=str, default=None, help="Container image path (Slurm only)"),
-        ),
-        (
-            "--mount",
-            dict(
-                type=str,
-                action="append",
-                default=[],
-                help="Container mounts in format host:container (can be specified multiple times)",
-            ),
-        ),
-        (
-            "--packager",
-            dict(
-                type=str,
-                default="none",
-                choices=["pattern", "git", "none"],
-                help="Code packaging method: 'none' (passthrough, use mounted/accessible code), "
-                "'pattern' (package *.py files), or 'git' (git archive).",
-            ),
-        ),
-        (
-            "--env",
-            dict(
-                type=str,
-                action="append",
-                default=[],
-                help="Environment variables in format KEY=VALUE (can be specified multiple times)",
-            ),
-        ),
-        (
-            "--experiment-name",
-            dict(
-                type=str,
-                default="megatron_bridge_training",
-                help="Name for the experiment",
-            ),
-        ),
-        (
-            "--dry-run",
-            dict(
-                action="store_true",
-                help="Print what would be executed without submitting the job",
-            ),
-        ),
-        (
-            "--detach",
-            dict(
-                action=argparse.BooleanOptionalAction,
-                default=True,
-                help="Detach from the experiment after submission (use --no-detach to wait)",
-            ),
-        ),
-        (
-            "--tail-logs",
-            dict(
-                action="store_true",
-                help="Tail logs after submission (only works with --no-detach)",
-            ),
-        ),
-    ]
-
-    cli = argparse.ArgumentParser(
-        description="Launch training with NeMo-Run (local or Slurm)",
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    for flag, spec in option_specs:
-        cli.add_argument(flag, **spec)
-
-    # parse_known_args keeps unrecognized tokens instead of erroring on them;
-    # main() appends them to the training script's arguments.
-    return cli.parse_known_args()
-
-
-def main() -> None:
-    """Launch a training script through NeMo-Run, locally or on Slurm.
-
-    Parses the command line, validates the mode-specific options, wraps the
-    script in a ``run.Script`` task, builds a ``run.LocalExecutor`` or a
-    ``run.SlurmExecutor``, and runs (or dry-runs) the task inside a
-    ``run.Experiment``.
-
-    Raises:
-        ValueError: If ``--local`` is combined with ``--ssh-tunnel`` or given
-            without ``--devices``; if Slurm mode lacks ``--partition`` or
-            ``--account``; if ``--ssh-tunnel`` lacks ``--host``, ``--user`` or
-            ``--remote-job-dir``; or if an ``--env`` entry contains no ``=``.
-        FileNotFoundError: If a relative ``--script`` does not exist under
-            ``SCRIPT_DIR``.
-    """
-    args, forwarded_args = parse_args()
-
-    # Validate arguments based on execution mode
-    if args.local:
-        # Local execution - SSH tunnel args are not used
-        if args.ssh_tunnel:
-            raise ValueError("--ssh-tunnel cannot be used with --local")
-        if args.devices is None:
-            raise ValueError("--devices is required for --local execution")
-    else:
-        # Slurm execution - require partition and account
-        if not args.partition or not args.account:
-            raise ValueError("--partition and --account are required for Slurm execution (omit --local)")
-
-        if args.ssh_tunnel and not all([args.host, args.user, args.remote_job_dir]):
-            raise ValueError("--ssh-tunnel requires --host, --user, and --remote-job-dir to be specified")
-
-    # Resolve the path handed to run.Script.
-    script_is_absolute = Path(args.script).is_absolute()
-    if script_is_absolute:
-        # Absolute paths are not checked locally: they are assumed to point
-        # inside the container or onto the cluster filesystem.
-        task_script_path = str(Path(args.script))
-        logger.info(f"Using absolute script path (container/cluster): {task_script_path}")
-    else:
-        # Relative path - resolve from SCRIPT_DIR and validate
-        script_path = SCRIPT_DIR / args.script
-        if not script_path.exists():
-            raise FileNotFoundError(f"Training script not found: {script_path}")
-        # Default for the passthrough packager. Without this assignment a
-        # relative --script combined with --packager none (the default) left
-        # task_script_path unbound and run.Script() raised UnboundLocalError.
-        # The pattern and git packagers below replace it with the relative path.
-        task_script_path = str(script_path)
-
-    # The recipe always comes first; unknown CLI tokens are forwarded verbatim.
-    script_args = ["--recipe", args.recipe, *forwarded_args]
-
-    # Determine packager
-    if args.packager == "pattern":
-        packager = run.PatternPackager(include_pattern="*.py", relative_path=str(SCRIPT_DIR))
-        logger.info("Using PatternPackager")
-        # For pattern packager, use relative path
-        if not script_is_absolute:
-            task_script_path = args.script
-    elif args.packager == "git":
-        packager = run.GitArchivePackager(subpath="scripts/training")
-        logger.info("Using GitArchivePackager")
-        # For git packager, use relative path
-        if not script_is_absolute:
-            task_script_path = args.script
-    else:  # none
-        packager = run.Packager()
-        logger.info("Using passthrough packager (no packaging)")
-
-    task = run.Script(
-        path=task_script_path,
-        entrypoint="python",
-        args=script_args,
-    )
-
-    # Parse environment variables; only the first "=" separates key from value,
-    # so values may themselves contain "=".
-    env_vars = {}
-    for env_str in args.env:
-        if "=" not in env_str:
-            raise ValueError(f"Invalid env format: {env_str}. Expected KEY=VALUE")
-        key, value = env_str.split("=", 1)
-        env_vars[key] = value
-
-    if env_vars:
-        # Log the names only, never the values (tokens and API keys).
-        logger.info(f"Setting environment variables: {list(env_vars.keys())}")
-
-    # argparse restricts --launcher to torchrun / ft / default; "default"
-    # means no launcher at all, the other two names are passed through as-is.
-    launcher = None if args.launcher == "default" else args.launcher
-    if launcher == "ft":
-        logger.debug("Using fault-tolerant launcher")
-
-    if args.local:
-        logger.debug("Using LocalExecutor")
-        executor = run.LocalExecutor(
-            ntasks_per_node=args.devices,
-            launcher=launcher,
-        )
-    else:
-        # Configure tunnel (SSH for remote, Local if already on cluster)
-        if args.ssh_tunnel:
-            tunnel = run.SSHTunnel(
-                host=args.host,
-                user=args.user,
-                job_dir=args.remote_job_dir,
-                identity=args.identity,
-            )
-            logger.debug(f"Using SSH tunnel to {args.user}@{args.host}")
-        else:
-            tunnel = run.LocalTunnel()
-            logger.debug("Using LocalTunnel (running on cluster)")
-
-        # Create the Slurm executor
-        executor_kwargs = {
-            "account": args.account,
-            "partition": args.partition,
-            "nodes": args.nodes,
-            "mem": "0",
-            "exclusive": True,
-            "time": args.time,
-            "tunnel": tunnel,
-            "packager": packager,
-        }
-
-        # Add devices only if specified
-        if args.devices is not None:
-            executor_kwargs["ntasks_per_node"] = args.devices
-            executor_kwargs["gpus_per_node"] = args.devices
-
-        # Add gres only if explicitly specified
-        if args.gres:
-            executor_kwargs["gres"] = args.gres
-
-        executor = run.SlurmExecutor(**executor_kwargs)
-
-        # Configure container if specified
-        if args.container_image:
-            executor.container_image = args.container_image
-
-        # Configure mounts if specified
-        if args.mount:
-            executor.container_mounts = args.mount
-
-    # Set environment variables (same handling for both executor types)
-    if env_vars:
-        executor.env_vars = env_vars
-
-    # Run the experiment
-    with run.Experiment(args.experiment_name) as exp:
-        exp.add(task, executor=executor, name="training")
-
-        if args.dry_run:
-            exp.dryrun()
-        else:
-            exp.run(detach=args.detach, tail_logs=args.tail_logs)
-
-            if args.detach:
-                if args.local:
-                    logger.info("Job started locally!")
-                else:
-                    logger.info("Job submitted to Slurm!")
-                    logger.info("Use 'squeue' to check job status")
-            else:
-                logger.info("Job completed!")
-
-
-if __name__ == "__main__":
-    # Script entry point: log at INFO with bare messages (no level or
-    # logger-name prefix), then hand control to main().
-    logging.basicConfig(level=logging.INFO, format="%(message)s")
-    main()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/README.md
-```md
-# Megatron Bridge Documentation
-
-Welcome to the Megatron Bridge documentation! This guide helps you navigate our comprehensive documentation to find exactly what you need for training, converting, and working with large language models and vision language models.
-
-## 🚀 Quick Start Paths
-
-### I want to
-
-**🏃‍♂️ Get started with model conversion**
-→ Start with [Bridge Guide](bridge-guide.md) for Hugging Face ↔ Megatron conversion
-
-**⚡ Understand parallelisms and performance**
-→ Jump to [Parallelisms Guide](parallelisms.md) and [Performance Guide](performance-guide.md)
-
-**🚀 Start training a model**
-→ See [Training Documentation](training/README.md) for comprehensive training guides
-
-**📚 Find model documentation**
-→ Browse [Supported Models](models/llm/index.md) for LLMs or [Vision Language Models](models/vlm/index.md) for VLMs
-
-**🔧 Migrate from NeMo 2 or Megatron-LM**
-→ Check [NeMo 2 Migration Guide](nemo2-migration-guide.md) or [Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md)
-
-**📊 Use training recipes**
-→ Read [Recipe Usage](recipe-usage.md) for pre-configured training recipes
-
-**🔌 Add support for a new model**
-→ Refer to [Adding New Models](adding-new-models.md)
-
-**📋 Check version information**
-→ See [Releases Documentation](releases/README.md) for versions, changelog, and known issues
-
----
-
-## 👥 Documentation by Role
-
-### For ML Engineers & Researchers
-
-- **Start here:** [Bridge Guide](bridge-guide.md) → [Training Documentation](training/README.md)
-- **Deep dive:** [Performance Guide](performance-guide.md) → [Training Optimization Guides](training/README.md#optimization-and-performance)
-- **Model support:** [Supported Models](models/llm/index.md) → [Adding New Models](adding-new-models.md)
-
-### For Training Engineers
-
-- **Start here:** [Training Documentation](training/README.md) → [Configuration Container Overview](training/config-container-overview.md)
-- **Performance:** [Performance Guide](performance-guide.md) → [Performance Summary](performance-summary.md)
-- **Parallelisms:** [Parallelisms Guide](parallelisms.md) → [Training Optimization](training/README.md#optimization-and-performance)
-
-### For Model Developers
-
-- **Start here:** [Bridge Guide](bridge-guide.md) → [Bridge Tech Details](bridge-tech-details.md)
-- **Model support:** [Adding New Models](adding-new-models.md) → [Model Documentation](models/llm/index.md)
-- **Integration:** [Bridge RL Integration](bridge-rl-integration.md)
-
-### For DevOps & Platform Teams
-
-- **Start here:** [Releases Documentation](releases/README.md) → [Software Versions](releases/software-versions.md)
-- **Troubleshooting:** [Known Issues](releases/known-issues.md)
-- **API Reference:** [API Documentation](apidocs/index.rst)
-
----
-
-## 📚 Complete Documentation Index
-
-### Getting Started
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Bridge Guide](bridge-guide.md)** | Hugging Face ↔ Megatron conversion guide | First time converting models |
-| **[Bridge Tech Details](bridge-tech-details.md)** | Technical details of the bridge system | Understanding bridge internals |
-| **[Parallelisms Guide](parallelisms.md)** | Data and model parallelism strategies | Setting up distributed training |
-| **[Performance Summary](performance-summary.md)** | Quick performance reference | Quick performance lookup |
-| **[Performance Guide](performance-guide.md)** | Comprehensive performance optimization | Optimizing training performance |
-
-### Model Support
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Large Language Models](models/llm/index.md)** | LLM model documentation | Working with LLM models |
-| **[Vision Language Models](models/vlm/index.md)** | VLM model documentation | Working with VLM models |
-| **[Adding New Models](adding-new-models.md)** | Guide for adding model support | Extending model support |
-
-### Training and Customization
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Training Documentation](training/README.md)** | Comprehensive training guides | Setting up and customizing training |
-| **[Configuration Container Overview](training/config-container-overview.md)** | Central training configuration | Understanding training configuration |
-| **[Entry Points](training/entry-points.md)** | Training entry points and execution | Understanding training flow |
-| **[Training Loop Settings](training/training-loop-settings.md)** | Training loop parameters | Configuring training parameters |
-| **[Optimizer & Scheduler](training/optimizer-scheduler.md)** | Optimization configuration | Setting up optimizers |
-| **[Mixed Precision](training/mixed-precision.md)** | Mixed precision training | Reducing memory usage |
-| **[PEFT](training/peft.md)** | Parameter-efficient fine-tuning | Fine-tuning with limited resources |
-| **[Checkpointing](training/checkpointing.md)** | Checkpoint management | Saving and resuming training |
-| **[Logging](training/logging.md)** | Logging and monitoring | Monitoring training progress |
-| **[Profiling](training/profiling.md)** | Performance profiling | Identifying bottlenecks |
-
-### Recipes and Workflows
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Recipe Usage](recipe-usage.md)** | Using pre-configured training recipes | Quick training setup |
-| **[Bridge RL Integration](bridge-rl-integration.md)** | Reinforcement learning integration | RL training workflows |
-
-### Migration Guides
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[NeMo 2 Migration Guide](nemo2-migration-guide.md)** | Migrating from NeMo 2 | Upgrading from NeMo 2 |
-| **[Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md)** | Migrating from Megatron-LM | Upgrading from Megatron-LM |
-
-### Reference
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[API Documentation](apidocs/index.rst)** | Complete API reference | Building integrations |
-| **[Releases Documentation](releases/README.md)** | Version history and known issues | Checking versions, troubleshooting |
-| **[Documentation Guide](documentation.md)** | Contributing to documentation | Contributing docs |
-
----
-
-## 🗺️ Common Reading Paths
-
-### 🆕 First-Time Users
-
-1. [Bridge Guide](bridge-guide.md) *(10 min - understand conversion)*
-2. [Parallelisms Guide](parallelisms.md) *(15 min - understand distributed training)*
-3. [Training Documentation](training/README.md) *(choose your training path)*
-4. [Recipe Usage](recipe-usage.md) *(5 min - use pre-configured recipes)*
-
-### 🔧 Setting Up Training
-
-1. [Training Documentation](training/README.md) *(overview of training system)*
-2. [Configuration Container Overview](training/config-container-overview.md) *(understand configuration)*
-3. [Entry Points](training/entry-points.md) *(how training starts)*
-4. [Training Loop Settings](training/training-loop-settings.md) *(configure parameters)*
-5. [Logging](training/logging.md) *(set up monitoring)*
-
-### ⚡ Performance Optimization
-
-1. [Performance Guide](performance-guide.md) *(comprehensive optimization strategies)*
-2. [Performance Summary](performance-summary.md) *(quick reference)*
-3. [Mixed Precision](training/mixed-precision.md) *(reduce memory usage)*
-4. [Communication Overlap](training/communication-overlap.md) *(optimize distributed training)*
-5. [Activation Recomputation](training/activation-recomputation.md) *(reduce memory footprint)*
-6. [Profiling](training/profiling.md) *(identify bottlenecks)*
-
-### 🔄 Model Conversion Workflow
-
-1. [Bridge Guide](bridge-guide.md) *(conversion basics)*
-2. [Bridge Tech Details](bridge-tech-details.md) *(technical details)*
-3. [Supported Models](models/llm/index.md) or [Vision Language Models](models/vlm/index.md) *(model-specific guides)*
-4. [Adding New Models](adding-new-models.md) *(extend support)*
-
-### 🔧 Customization and Extension
-
-1. [Training Documentation](training/README.md) *(training customization)*
-2. [PEFT](training/peft.md) *(parameter-efficient fine-tuning)*
-3. [Distillation](training/distillation.md) *(knowledge distillation)*
-4. [Adding New Models](adding-new-models.md) *(add model support)*
-5. [Bridge RL Integration](bridge-rl-integration.md) *(RL workflows)*
-
-### 📦 Migration Paths
-
-1. [NeMo 2 Migration Guide](nemo2-migration-guide.md) *(from NeMo 2)*
-2. [Megatron-LM Migration Guide](megatron-lm-to-megatron-bridge.md) *(from Megatron-LM)*
-3. [Training Documentation](training/README.md) *(new training system)*
-
----
-
-## 📁 Directory Structure
-
-### Main Documentation
-
-- **Guides** - Core guides for parallelisms, performance, recipes, and migration
-- **Bridge Documentation** - Hugging Face ↔ Megatron conversion guides
-- **Model Documentation** - Supported model families and architectures
-
-### Subdirectories
-
-#### [models/](models/README.md)
-
-- **[llm/](models/llm/README.md)** - Large Language Model documentation
-  - Individual model guides (Qwen, LLaMA, Mistral, etc.)
-  - Conversion examples and training recipes
-- **[vlm/](models/vlm/README.md)** - Vision Language Model documentation
-  - VLM model guides (Qwen VL, Gemma VL, etc.)
-  - Multimodal model support
-
-#### [training/](training/README.md)
-
-- **Configuration** - ConfigContainer, entry points, training loop settings
-- **Optimization** - Optimizer, scheduler, mixed precision, communication overlap
-- **Performance** - Attention optimizations, activation recomputation, CPU offloading
-- **Monitoring** - Logging, profiling, checkpointing, resiliency
-- **Advanced** - PEFT, packed sequences, distillation
-
-#### [releases/](releases/README.md)
-
-- **Software Versions** - Current versions and dependencies
-- **Changelog** - Release history and changes
-- **Known Issues** - Bugs, limitations, and workarounds
-
----
-
-## 🔗 How Documents Connect
-
-```mermaid
-graph TD
-    A[README.md<br/>Start Here] --> B[Bridge Guide<br/>Model Conversion]
-    A --> C[Training Docs<br/>Training Setup]
-    A --> D[Models<br/>Model Support]
-    
-    B --> E[Bridge Tech Details<br/>Technical Deep Dive]
-    B --> F[Supported Models<br/>Model-Specific Guides]
-    
-    C --> G[Config Container<br/>Configuration]
-    C --> H[Performance Guide<br/>Optimization]
-    C --> I[Parallelisms<br/>Distributed Training]
-    
-    G --> J[Training Loop<br/>Training Parameters]
-    G --> K[Optimizer & Scheduler<br/>Optimization Setup]
-    
-    H --> L[Mixed Precision<br/>Memory Efficiency]
-    H --> M[Communication Overlap<br/>Performance]
-    
-    I --> N[Data Parallelism<br/>DDP]
-    I --> O[Model Parallelism<br/>TP/PP/VPP]
-    
-    D --> P[LLM Models<br/>Language Models]
-    D --> Q[VLM Models<br/>Vision Language Models]
-    
-    style A fill:#e1f5fe
-    style B fill:#f3e5f5
-    style C fill:#e8f5e8
-    style D fill:#fff3e0
-    style H fill:#fce4ec
-    style I fill:#e0f2f1
-```
-
----
-
-## 🤝 Getting Help
-
-- **GitHub Issues:** [Report bugs or request features](https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues)
-- **Documentation Issues:** Found something unclear? Let us know!
-- **Community:** Join discussions and share experiences
-
----
-
-## 📖 Additional Resources
-
-- **[Examples](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/examples)** - Code examples and tutorials
-- **[Contributing Guide](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/CONTRIBUTING.md)** - How to contribute to the project
-- **[API Documentation](apidocs/index.rst)** - Complete API reference
-
----
-
-**Ready to get started?** Choose your path above or dive into the [Bridge Guide](bridge-guide.md) for model conversion! 🚀
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-summary.md
-```md
-# Performance
-
-As part of the NVIDIA NeMo Framework, Megatron Bridge provides optimal performance for training advanced generative AI models by incorporating the most recent training techniques, such as model parallelization, optimized attention mechanisms, and more, to achieve high training throughput.
-
-This page provides performance benchmarks for large language models using Megatron-Bridge across different GPU systems and configurations.
-
-## Nomenclature
-
-- **GBS**: Global Batch Size
-- **MBS**: Micro Batch Size
-- **FSDP**: Fully Sharded Data Parallel
-  - FSDP > 0: use FSDP with sharding group size = #GPUs / (TP × PP)
-  - FSDP = 0: use DDP (Distributed Data Parallel)
-- **TP**: Tensor Parallel Size
-- **PP**: Pipeline Parallel Size
-- **CP**: Context Parallel Size
-- **VP**: Virtual Pipeline Parallel Size
-- **EP**: Expert Parallel Size
-- **GA**: Number of Gradient Accumulations
-
-## Performance Metrics
-
-Performance is measured using:
-
-- **Tokens/sec/GPU**: Throughput per GPU
-- **Model TFLOP/sec/GPU**: Model floating-point operations per second per GPU
-
-## Performance Summary for Large Language Models
-
-Below are performance benchmarks for various large language models. These results were obtained using performance recipes available [here](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/scripts/performance).
-
-The performance data includes:
-
-- **Pre-training Performance**: Throughput metrics for various model sizes and architectures
-- **System Configurations**: Results across different GPU systems (DGX-GB300, DGX-GB200, DGX-B300, DGX-B200, DGX-H100)
-- **Precision Options**: Performance comparisons between different precision modes (BF16, FP8, MXFP8, NVFP4)
-
----
-
-## 26.02.01 NeMo Container
-
-### Pre-Training Performance
-
-#### Model: LLAMA3_70B
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 64 | NVFP4 | 256 | 2 | 8192 | 0 | 1 | 1 | 1 | n/a | n/a | 7002 | 3147 |
-| DGX-GB200 | 64 | NVFP4 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 4557 | 2047 |
-| DGX-GB300 | 64 | MXFP8 | 256 | 2 | 8192 | 0 | 1 | 4 | 1 | n/a | n/a | 4798 | 2157 |
-| DGX-GB200 | 64 | MXFP8 | 256 | 1 | 8192 | 0 | 2 | 4 | 1 | 5 | n/a | 3837 | 1724 |
-| DGX-GB300 | 64 | FP8 | 256 | 2 | 8192 | 64 | 1 | 1 | 1 | n/a | n/a | 5243 | 2353 |
-| DGX-GB200 | 64 | FP8 | 256 | 2 | 8192 | 64 | 1 | 1 | 1 | n/a | n/a | 4357 | 1956 |
-| DGX-H100 | 64 | FP8 | 256 | 1 | 8192 | 0 | 4 | 8 | 1 | 5 | n/a | 1639 | 736 |
-
-#### Model: LLAMA3.1_405B
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 8 | 1 | 4 | n/a | 1358 | 3428 |
-| DGX-GB200 | 256 | NVFP4 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 1083 | 2734 |
-| DGX-GB300 | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 949 | 2394 |
-| DGX-GB200 | 256 | MXFP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 8 | n/a | 775 | 1957 |
-| DGX-GB300 | 256 | FP8 | 1536 | 1 | 8192 | 0 | 2 | 8 | 2 | 4 | n/a | 1024 | 2585 |
-| DGX-GB200 | 256 | FP8 | 1536 | 1 | 8192 | 0 | 4 | 16 | 1 | 4 | n/a | 818 | 2063 |
-
-#### Model: DeepSeekV3
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 2 | 1 | 8 | 32 | 4691 | 1219 |
-| DGX-GB200 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 4021 | 1046 |
-| DGX-B300 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 3099 | 806 |
-| DGX-B200 | 256 | MXFP8 | 4096 | 1 | 4096 | 0 | 1 | 16 | 1 | n/a | 8 | 2790 | 725 |
-
-#### Model: GPT OSS 120B
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 19366 | 526 |
-| DGX-GB200 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 64 | 15754 | 428 |
-| DGX-B300 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 15031 | 412 |
-| DGX-B200 | 64 | BF16 | 1280 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 13722 | 373 |
-| DGX-H100 | 64 | BF16 | 1280 | 1 | 4096 | 0 | 1 | 4 | 1 | n/a | 8 | 5984 | 163 |
-
-#### Model: Qwen3_30B_a3B
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 30411 | 700 |
-| DGX-GB200 | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26373 | 607 |
-| DGX-B300 | 8 | MXFP8 | 512 | 8 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 29454 | 678 |
-| DGX-B200 | 8 | MXFP8 | 512 | 4 | 4096 | 0 | 1 | 1 | 1 | n/a | 8 | 26695 | 614 |
-| DGX-H100 | 16 | FP8 | 1024 | 1 | 4096 | 0 | 1 | 2 | 1 | 12 | 8 | 9058 | 208 |
-
-#### Model: Qwen3_235B_a22B
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 256 | MXFP8 | 8192 | 2 | 4096 | 0 | 1 | 4 | 1 | n/a | 32 | 6583 | 974 |
-| DGX-GB200 | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | n/a | 32 | 5530 | 819 |
-| DGX-B300 | 256 | MXFP8 | 8192 | 1 | 4096 | 0 | 1 | 8 | 1 | 4 | 8 | 2644 | 391 |
-| DGX-H100 | 256 | FP8 | 8192 | 1 | 4096 | 0 | 2 | 8 | 1 | 4 | 32 | 1611 | 238 |
-
-#### Model: Nemotron_3_Nano
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 8 | MXFP8 | 512 | 4 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 37664 | 839 |
-| DGX-GB200 | 8 | MXFP8 | 512 | 2 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 33934 | 756 |
-| DGX-B300 | 8 | MXFP8 | 512 | 4 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 35861 | 798 |
-| DGX-H100 | 16 | FP8 | 1024 | 1 | 8192 | 0 | 1 | 1 | 1 | n/a | 8 | 14890 | 331 |
-
-#### Model: Kimi_K2
-
-| System | #-GPUs | Precision | GBS | MBS | Sequence Length | FSDP | TP | PP | CP | VP | EP | Tokens / sec / GPU | Model TFLOP / sec / GPU |
-|--------|--------|-----------|-----|-----|-----------------|------|----|----|----|----|----|-----------------------|-------------------------|
-| DGX-GB300 | 256 | MXFP8 | 4096 | 2 | 4096 | 0 | 1 | 4 | 1 | 4 | 64 | 5072 | 1037 |
-
--  Muon optimizer was used for pre-training Kimi-K2.
-
-- In MoE training benchmarks, we force-balance the token distribution among experts and all benchmarks are token-dropless.
-
-## Archive
-
-Performance summary for past releases can be found in the [archive](performance-summary-archive.md).
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/run_recipe.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Generic Training Script for LLM and diffusion models
-
-This script works with any model family that uses GPT-style training
-(Llama, Gemma, Qwen, GPT, etc.) and with diffusion models (e.g. FLUX, WAN). It dynamically loads recipes and supports
-CLI overrides. The --dataset flag selects the dataset type and automatically
-infers pretrain vs finetune mode.
-
-Usage:
-    Pretrain (mock data):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain-mock
-
-    Pretrain (real data):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain \\
-            'dataset.blend=[[/data/my_dataset_text_document],null]'
-
-    Finetune (SQuAD, default):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune
-
-    Finetune (GSM8K):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune \\
-            dataset.dataset_name=gsm8k
-
-    Finetune (user-supplied JSONL):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune-preloaded \\
-            dataset.dataset_root=/data/my_finetune_data
-
-    Diffusion pretrain:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe wan_1_3B_pretrain_config \\
-            --step_func wan_step \\
-            dataset.path=/data/energon
-
-    Diffusion SFT (full finetuning):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe wan_1_3B_sft_config \\
-            --step_func wan_step \\
-            dataset.path=/data/energon
-
-    VLM with HF dataset:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_config \\
-            --dataset vlm-hf \\
-            --step_func qwen3_vl_step \\
-            dataset.maker_name=cord_v2 \\
-            dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    VLM with Energon dataset:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_energon_config \\
-            --dataset vlm-energon \\
-            --step_func qwen3_vl_step \\
-            dataset.path=/data/energon \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    VLM with preloaded JSON:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_config \\
-            --dataset vlm-preloaded \\
-            --step_func qwen3_vl_step \\
-            dataset.train_data_path=/data/vlm_train.json \\
-            dataset.image_folder=/data/vlm_images \\
-            dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    With CLI overrides (Hydra-style, works for any config field):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain-mock \\
-            train.train_iters=5000 \\
-            optimizer.lr=0.0003
-
-Recipe Arguments:
-    Generic scripts call recipes with no arguments: recipe().
-
-    If you need to pass arguments to the recipe constructor
-    (e.g., custom parallelism at build time), create a custom script.
-"""
-
-import argparse
-import inspect
-from typing import Callable
-
-import megatron.bridge.recipes as recipes
-
-# Diffusion forward steps: use class instances so they can be passed as forward_step_func
-from megatron.bridge.diffusion.models.flux.flux_step import FluxForwardStep
-from megatron.bridge.diffusion.models.wan.wan_step import WanForwardStep
-from megatron.bridge.models.qwen_vl.qwen3_vl_step import forward_step as qwen3_vl_forward_step
-from megatron.bridge.recipes.utils.dataset_utils import (
-    DATASET_TYPES,
-    apply_dataset_override,
-    infer_mode_from_dataset,
-)
-from megatron.bridge.training.audio_lm_step import forward_step as audio_lm_forward_step
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.finetune import finetune
-from megatron.bridge.training.gpt_step import forward_step as gpt_forward_step
-from megatron.bridge.training.llava_step import forward_step as llava_forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.training.utils.omegaconf_utils import process_config_with_overrides
-from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step
-
-
-STEP_FUNCTIONS: dict[str, Callable] = {
-    "audio_lm_step": audio_lm_forward_step,
-    "gpt_step": gpt_forward_step,
-    "vlm_step": vlm_forward_step,
-    "qwen3_vl_step": qwen3_vl_forward_step,
-    "llava_step": llava_forward_step,
-    "flux_step": FluxForwardStep,
-    "wan_step": WanForwardStep,
-}
-
-TRAIN_FUNCTIONS = {
-    "pretrain": pretrain,
-    "finetune": finetune,
-}
-
-ERR_UNKNOWN_STEP = "Unknown step type: {step_type}. Choose from: {choices}"
-ERR_INFER_MODE_FAILED = (
-    "Unable to infer training mode. "
-    "Pass --dataset to specify the dataset type, or include 'pretrain' or 'finetune' "
-    "(or 'sft'/'peft') in the recipe name."
-)
-
-
-def parse_args() -> tuple[argparse.Namespace, list[str]]:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Generic training script for LLM and diffusion models",
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--recipe",
-        type=str,
-        required=True,
-        help="Recipe function name (e.g., llama32_1b_pretrain_config, gemma3_1b_sft_config, gemma3_1b_peft_config)",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        choices=DATASET_TYPES,
-        help=(
-            "Dataset type. Training mode (pretrain/finetune) is inferred from this.\n"
-            "LLM datasets:\n"
-            "  llm-pretrain           GPT pretrain data (set dataset.blend=<path>)\n"
-            "  llm-pretrain-mock      Mock pretrain data for testing\n"
-            "  llm-finetune           HF finetune dataset (set dataset.dataset_name=squad|gsm8k|openmathinstruct2)\n"
-            "  llm-finetune-preloaded User-supplied JSONL (set dataset.dataset_root=<path>)\n"
-            "VLM datasets:\n"
-            "  vlm-energon            Energon multimodal (set dataset.path=<path>)\n"
-            "  vlm-hf                 HF VLM dataset (set dataset.maker_name=<name>)\n"
-            "  vlm-preloaded          User-supplied VLM JSON (set dataset.train_data_path=<path>)"
-        ),
-    )
-    parser.add_argument(
-        "--step_func",
-        type=str,
-        default="gpt_step",
-        choices=sorted(STEP_FUNCTIONS.keys()),
-        help="Step function: gpt_step (text-only), vlm_step (vision-language), llava_step (LLaVA), "
-        "flux_step (FLUX diffusion), wan_step (WAN diffusion, hyperparameters selected by --mode/recipe name)",
-    )
-    parser.add_argument(
-        "--peft_scheme",
-        type=str,
-        default=None,
-        help="PEFT scheme to use: 'lora', 'dora', or None.",
-    )
-    parser.add_argument(
-        "--packed_sequence",
-        action="store_true",
-        default=False,
-        help="Enable packed sequence training (default: False)",
-    )
-    parser.add_argument(
-        "--seq_length",
-        type=int,
-        default=None,
-        help="Sequence length for training",
-    )
-    parser.add_argument(
-        "--hf_path",
-        type=str,
-        default=None,
-        help="HuggingFace model ID or local path to model directory. "
-        "Use a local path for more stable multinode training.",
-    )
-    args, cli_overrides = parser.parse_known_args()
-    return args, cli_overrides
-
-
-def load_recipe(
-    recipe_name: str,
-    peft_scheme: str | None,
-    packed_sequence: bool = False,
-    seq_length: int | None = None,
-    hf_path: str | None = None,
-) -> ConfigContainer:
-    """
-    Load recipe by name from megatron.bridge.recipes.
-
-    Args:
-        recipe_name: Full recipe function name (e.g., 'llama32_1b_pretrain_config')
-        peft_scheme: PEFT scheme to use ('lora', 'dora', or None)
-        packed_sequence: Enable packed sequence training (default: False)
-        seq_length: Sequence length for training (optional)
-        hf_path: HuggingFace model ID or local path to model directory (optional)
-
-    Returns:
-        ConfigContainer from calling the recipe
-
-    Raises:
-        AttributeError: If recipe not found
-    """
-    if not hasattr(recipes, recipe_name):
-        raise AttributeError(
-            f"Recipe '{recipe_name}' not found in megatron.bridge.recipes.\n"
-            f"Make sure the recipe name is correct and the recipe is exported in its family __init__.py.\n"
-            f"Example recipe names: llama32_1b_pretrain_config, gemma3_1b_pretrain_config, qwen3_8b_pretrain_config"
-        )
-
-    config_builder = getattr(recipes, recipe_name)
-
-    # Inspect the recipe's signature to determine which arguments it accepts
-    try:
-        sig = inspect.signature(config_builder)
-        params = sig.parameters
-        has_var_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
-
-        accepts_peft = "peft" in params or has_var_keyword
-        accepts_packed_sequence = "packed_sequence" in params or has_var_keyword
-        accepts_seq_length = "seq_length" in params or has_var_keyword
-        accepts_hf_path = "hf_path" in params or has_var_keyword
-    except (ValueError, TypeError):
-        # If signature inspection fails, fallback conservatively
-        accepts_peft = True  # peft is widely supported, try passing it
-        accepts_packed_sequence = False  # new parameter, don't pass if unsure
-        accepts_seq_length = False  # new parameter, don't pass if unsure
-        accepts_hf_path = False  # model-specific, don't pass if unsure
-
-    # Build kwargs dynamically based on what the recipe accepts
-    kwargs = {}
-    if accepts_peft:
-        kwargs["peft"] = peft_scheme
-    if accepts_packed_sequence and packed_sequence:
-        kwargs["packed_sequence"] = packed_sequence
-    if accepts_seq_length and seq_length is not None:
-        kwargs["seq_length"] = seq_length
-    if accepts_hf_path and hf_path is not None:
-        kwargs["hf_path"] = hf_path
-
-    try:
-        return config_builder(**kwargs)
-    except TypeError:
-        # Fallback if the kwargs are not accepted despite signature inspection
-        return config_builder()
-
-
-def load_forward_step(step_type: str, mode: str | None = None) -> Callable:
-    """Load forward_step function based on the requested step type."""
-    step_key = step_type.lower()
-    if step_key not in STEP_FUNCTIONS:
-        raise ValueError(ERR_UNKNOWN_STEP.format(step_type=step_type, choices=", ".join(STEP_FUNCTIONS)))
-    step = STEP_FUNCTIONS[step_key]
-    if inspect.isclass(step):
-        if "mode" in inspect.signature(step.__init__).parameters:
-            return step(mode=mode)
-        return step()
-    return step
-
-
-def infer_train_mode(recipe_name: str) -> str:
-    """Infer training mode from the recipe name (fallback when --dataset is not passed)."""
-    lowered = recipe_name.lower()
-    has_pretrain = "pretrain" in lowered
-    has_finetune = "finetune" in lowered or "sft" in lowered or "peft" in lowered
-    if has_pretrain ^ has_finetune:
-        return "pretrain" if has_pretrain else "finetune"
-    raise ValueError(ERR_INFER_MODE_FAILED)
-
-
-def main() -> None:
-    """Run GPT training (pretrain or finetune)."""
-    args, cli_overrides = parse_args()
-
-    config: ConfigContainer = load_recipe(
-        args.recipe,
-        args.peft_scheme,
-        args.packed_sequence,
-        args.seq_length,
-        args.hf_path,
-    )
-
-    if args.dataset is not None:
-        mode = infer_mode_from_dataset(args.dataset)
-        config = apply_dataset_override(
-            config,
-            dataset_type=args.dataset,
-            packed_sequence=args.packed_sequence,
-            seq_length=args.seq_length,
-            cli_overrides=cli_overrides,
-        )
-    else:
-        mode = infer_train_mode(args.recipe)
-
-    config = process_config_with_overrides(
-        config,
-        cli_overrides=cli_overrides or None,
-    )
-
-    # Ensure dataset.seq_length and model.seq_length stay in sync after CLI overrides
-    if (
-        hasattr(config, "model")
-        and config.model is not None
-        and hasattr(config, "dataset")
-        and config.dataset is not None
-    ):
-        if hasattr(config.dataset, "seq_length") and config.model.seq_length != config.dataset.seq_length:
-            config.model.seq_length = config.dataset.seq_length
-
-    forward_step = load_forward_step(args.step_func, mode=mode)
-    train_func = TRAIN_FUNCTIONS[mode]
-    train_func(config=config, forward_step_func=forward_step)
-
-
-if __name__ == "__main__":
-    main()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/recipe-usage.md
-```md
-# Using Recipes
-
-Megatron Bridge provides production-ready training recipes for several popular models. You can find an overview of supported recipes and 🤗 HuggingFace bridges [here](index.md#supported-models).
-This guide will cover the next steps to make use of a training recipe, including how to [override configuration](#overriding-configuration) and how to [launch a job](#launch-methods).
-
-## Overview
-
-- **Coverage**: We provide recipes across select model families and sizes, including Llama, Qwen, DeepSeek, and Nemotron-H (Mamba-based).
-- **Defaults**: Each recipe sets defaults meant for convergence and performance across parallelisms, precision data types, and optimizer & scheduler choices. These recipes can be used as a high-quality starting point. 
-- **Integration**: Recipes return a single `ConfigContainer` that plugs directly into our training [entry points](training/entry-points.md) (see the published docs as well: https://docs.nvidia.com/nemo/megatron-bridge/latest/training/entry-points.html).
-- **Customization**: You can override any part of the recipe (Python, YAML, CLI) to adapt to your data, scale, and objectives.
-
-## Overriding configuration
-
-Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md).
-The benefit of providing the full recipe through a pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse` or something else. In other words, the user may override the recipe however they see fit.
-
-The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py).
-
-
-### Python
-
-If you prefer to manage configuration in Python, you can directly modify attributes of the `ConfigContainer`:
-
-```python
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-
-# Get the base ConfigContainer from the recipe
-cfg: ConfigContainer = pretrain_config()
-
-# Apply overrides. Note the hierarchical structure
-cfg.train.train_iters = 20
-cfg.train.global_batch_size = 8
-cfg.train.micro_batch_size = 1
-cfg.logger.log_interval = 1
-```
-
-You can also replace entire sub-configs of the `ConfigContainer`:
-
-```python
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.models.llama import Llama3ModelProvider
-
-cfg: ConfigContainer = pretrain_config()
-
-small_llama = Llama3ModelProvider(
-    num_layers=2,
-    hidden_size=768,
-    ffn_hidden_size=2688,
-    num_attention_heads=16,
-)
-cfg.model = small_llama
-```
-
-### YAML
-Overriding a configuration recipe with a YAML file can be done using OmegaConf utilities:
-
-```python
-from omegaconf import OmegaConf
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-)
-
-cfg: ConfigContainer = pretrain_config()
-yaml_filepath = "conf/llama3-8b-benchmark-cfg.yaml"
-
-# Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-# excluded_fields holds some configuration that cannot be serialized into a DictConfig
-merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-# Load and merge YAML overrides
-yaml_overrides_omega = OmegaConf.load(yaml_filepath)
-merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega)
-
-# Apply overrides while preserving excluded fields
-final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-```
-
-The above snippet will update `cfg` with all overrides from `llama3-8b-benchmark-cfg.yaml`.
-
-### Hydra-style
-
-Megatron Bridge provides some utilities to update the ConfigContainer using Hydra-style CLI overrides:
-
-```python
-import sys
-from omegaconf import OmegaConf
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-    parse_hydra_overrides,
-)
-
-cfg: ConfigContainer = pretrain_config()
-cli_overrides = sys.argv[1:]
-
-# Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-# excluded_fields holds some configuration that cannot be serialized into a DictConfig
-merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-# Parse and merge CLI overrides
-merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides)
-
-# Apply overrides while preserving excluded fields
-final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-```
-
-After the above snippet, `cfg` will be updated with all CLI-provided overrides. 
-A script containing the above code could be called like so:
-
-```sh
-torchrun <torchrun arguments> pretrain_cli_overrides.py model.tensor_model_parallel_size=4 train.train_iters=100000 ...
-```
-
-## Launch methods
-
-Megatron Bridge supports launching scripts with both `torchrun` and [NeMo-Run](https://github.com/NVIDIA-NeMo/Run).
-Once your script is ready to be launched, refer to one of the following sections.
-
-### Torchrun
-Megatron Bridge training scripts can be launched with the `torchrun` command that most PyTorch users are familiar with.
-Simply specify the number of GPUs to use with `--nproc-per-node` and the number of nodes with `--nnodes`. For example, on a single node:
-
-```sh
-torchrun --nnodes 1 --nproc-per-node 8 /path/to/train/script.py <args to pretrain script>
-```
-
-For multi-node training, it is recommended to use a cluster orchestration system like SLURM.
-The `torchrun` command should be wrapped as specified by your cluster orchestration system.
-For example, with Slurm, wrap the `torchrun` command inside of `srun`:
-
-```sh
-# launch.sub
-
-srun --nodes 2 --gpus-per-node 8 \
-    --container-image <image tag> --container-mounts <mounts> \
-    bash -c "
-        torchrun --nnodes $SLURM_NNODES --nproc-per-node $SLURM_GPUS_PER_NODE /path/to/train/script.py <args to pretrain script>
-    "
-```
-
-Add any other flags required by your cluster to the `srun` command. It is also recommended to use a NeMo Framework container with Slurm. You can find a list of container tags on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags).
-
-### NeMo-Run
-
-Megatron Bridge also supports launching training with [NeMo-Run](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html). NeMo-Run is a Python package that enables configuring and executing experiments across several platforms.
-For multi-node training, NeMo-Run will generate a script with appropriate commands, similar to the `srun` command described above.
-
-The recommended method to launch a Megatron Bridge script with NeMo-Run is through the `run.Script` API.
-You can modify the following 3 steps to your needs in a new file:
-
-```python
-import nemo_run as run
-
-if __name__ == "__main__":
-    # 1) Configure the `run.Script` object
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-
-    # 2) Define an executor for the desired target platform
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    # 3) Execute
-    run.run(train_script, executor=executor)
-```
-
-NeMo-Run supports launching on several different platforms, including [SLURM clusters](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#slurmexecutor).
-For more details, please see the NeMo-Run [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#) for a list of supported platforms, their corresponding executors, and configuration instructions.
-
-You can also forward arguments from the NeMo-Run launch script to the target script:
-
-```python
-import nemo_run as run
-import argparse
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    ...
-    known_args, args_to_fwd = parser.parse_known_args()
-    train_script = run.Script(..., args=args_to_fwd)
-```
-
-For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py).
-
-#### Plugins
-
-Megatron Bridge provides several NeMo-Run plugins to simplify the usage of certain features.
-These plugins can simply be added to the `run.run()` call:
-
-```python
-import nemo_run as run
-from megatron.bridge.recipes.run_plugins import NsysPlugin
-
-if __name__ == "__main__":
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    plugins = [] # plugins argument expects a list
-    nsys = NsysPlugin(profile_step_start=10, profile_step_end=15, ...)
-    plugins.append(nsys)
-    run.run(train_script, plugins=plugins, executor=executor)
-```
-
-##### Custom Argument Converters
-
-By default, plugins convert their configuration to Hydra-style CLI arguments when used with `run.Script` tasks. If your training script uses a different argument format (e.g., argparse), you can provide a custom converter function via the `script_args_converter_fn` parameter.
-
-```python
-import nemo_run as run
-from typing import List
-from megatron.bridge.recipes.run_plugins import (
-    PreemptionPlugin,
-    PreemptionPluginScriptArgs,
-)
-
-# Define a custom converter for argparse-style arguments
-def argparse_preemption_converter(args: PreemptionPluginScriptArgs) -> List[str]:
-    result = []
-    if args.enable_exit_handler:
-        result.append("--enable-exit-handler")
-    if args.enable_exit_handler_for_data_loader:
-        result.append("--enable-exit-handler-dataloader")
-    return result
-
-if __name__ == "__main__":
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    # Use the plugin with the custom converter
-    plugin = PreemptionPlugin(
-        preempt_time=120,
-        enable_exit_handler=True,
-        script_args_converter_fn=argparse_preemption_converter,
-    )
-    run.run(train_script, plugins=[plugin], executor=executor)
-```
-
-Each plugin provides its own corresponding dataclass (e.g., `PreemptionPluginScriptArgs`, `NsysPluginScriptArgs`) that defines the available arguments for conversion.
-
-See the [API reference](#bridge.recipes.run_plugins) for a list of available NeMo-Run plugins.
-
-### Avoiding Hangs
-
-When working with any scripts in Megatron Bridge, please make sure you wrap your code in an `if __name__ == "__main__":`
-block. Otherwise, your code may hang unexpectedly.
-
-The reason for this is that Megatron Bridge uses Python's `multiprocessing` module in the backend when running a
-multi-GPU job. The multiprocessing module will create new Python processes that will import the current module (your
-script). If you do not add the `if __name__ == "__main__":` guard, each of those processes will run your script's
-top-level code on import and spawn new processes of its own. This results in an infinite loop of process spawning.
-
-## Resources
-
-- [OmegaConf documentation](https://omegaconf.readthedocs.io/en/2.3_branch/)
-- [torchrun Documentation](https://docs.pytorch.org/docs/stable/elastic/run.html)
-- [PyTorch Multinode Training documentation](https://docs.pytorch.org/tutorials/intermediate/ddp_series_multinode.html)
-- [NeMo-Run documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html#)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md
-```md
-# Packed Sequences
-
-Packed sequences are a fine-tuning technique that reduces padding waste by
-concatenating multiple examples into one pack while preserving sequence
-boundaries for attention. In Megatron Bridge, this is primarily a supervised
-fine-tuning and PEFT optimization rather than a general pretraining feature.
-
-This page is the stable overview for what packed sequences are, when to use
-them, and which constraints are durable. For operational setup, code anchors,
-and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md).
-
-## What It Is
-
-Fine-tuning datasets often contain examples with highly variable lengths. When
-those examples are batched conventionally, many tokens in each batch are just
-padding. Packed sequences reduce that waste by building longer packs from
-multiple examples and carrying boundary metadata into the attention path.
-
-In Bridge today, there are two distinct packing paths plus long-context
-enablement through context parallelism:
-
-| Path | Use case | Key config |
-|---|---|---|
-| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` |
-| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` |
-| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` |
-
-These are related but they are not the same knob. Offline packed SFT and VLM
-in-batch packing solve padding waste; long-context training primarily addresses
-activation memory and communication tradeoffs at larger sequence lengths.
-
-## When to Use It
-
-Packed sequences are a good fit when all of the following are true:
-
-- you are doing SFT, PEFT, or VLM finetuning (all three packing paths are
-  supported; see the path table above)
-- your examples have variable lengths and padding waste is significant
-- you can tolerate the micro-batch constraints of packed training
-
-Packed sequences are usually not the right answer when:
-
-- you are doing standard Megatron-style pretraining, which already concatenates
-  documents during sampling
-- you want long-context training in general, where context parallelism is often
-  the main technique
-- your model family or recipe explicitly opts out of packed-sequence support
-
-## Stable Constraints
-
-The durable constraints for packed sequences in Bridge are:
-
-- packed SFT requires `micro_batch_size == 1`
-- when context parallelism is used, sequence length must satisfy the standard
-  CP divisibility constraints
-- for fine-tuning with CP enabled, per-token loss behavior and reduction
-  settings matter
-- CUDA-graph-friendly packed metadata requires additional padding constraints
-
-Model-family support is not universal. Some families and recipe paths explicitly
-opt out of packed sequences or related packing modes.
-
-## Relationship to Long-Sequence Training
-
-Packed sequences and long-sequence training are often mentioned together because
-both affect sequence layout and memory behavior, but they solve different
-problems:
-
-- packed sequences mainly reduce padding waste in fine-tuning datasets
-- long-sequence training mainly addresses activation memory and communication
-  tradeoffs at larger sequence lengths
-
-For long-sequence training guidance, see:
-
-- `docs/performance-guide.md`
-- `docs/training/hybrid-context-parallel.md`
-
-## Practical Caveats
-
-The most stable caveats to remember are:
-
-1. Packed-sequence support is recipe- and model-family-specific.
-2. Fine-tuning sequence packing should not be assumed to work with every other
-   training feature.
-3. Packed sequences improve efficiency primarily by reducing padding waste, not
-   by replacing long-context parallelism or memory-planning techniques.
-
-## Related Docs
-
-- [docs/training/multi-token-prediction.md](multi-token-prediction.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md)
-- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md)
-- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml)
-- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md)
-- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotron3-super.md
-```md
-# Nemotron 3 Super
-[Nemotron 3 Super](https://huggingface.co/collections/nvidia/nvidia-nemotron-v3) is a large language model (LLM) trained by NVIDIA, designed to deliver strong agentic, reasoning, and conversational capabilities. It employs a hybrid **Latent Mixture-of-Experts (LatentMoE)** architecture, utilizing interleaved Mamba-2 and MoE layers, along with select Attention layers. Distinct from the Nano model, the Super model incorporates **Multi-Token Prediction (MTP)** layers for faster text generation and improved quality, and it is trained using **NVFP4** quantization to maximize compute efficiency. The model has **12B active parameters** and **120B parameters in total**.
-
-NeMo Megatron Bridge supports pretraining, full-parameter finetuning, and LoRA finetuning of this model. The finetuned model can be converted back to the 🤗 Hugging Face format for downstream evaluation.
-
-```{important}
-Please use the custom container `nvcr.io/nvidia/nemo:26.02.nemotron_3_super` when working with this model.
-
-Run all commands from `/opt/Megatron-Bridge` (e.g. `docker run -w /opt/Megatron-Bridge ...`)
-```
-
-## Getting the Latest Code
-
-For the best experience, it is recommended to use the latest code from the `super-v3` branch. There are two ways to do this:
-
-### Option 1: Update the Code Inside the Container
-
-Launch the container and update the code in-place:
-
-```bash
-# Pull the latest changes from the super-v3 branch
-cd /opt/Megatron-Bridge
-git pull origin super-v3
-```
-
-### Option 2: Mount the Repo from Host
-
-This approach lets you work with the code on your host machine and mount it into the container at runtime.
-
-**Step 1 — Pull the latest `super-v3` branch on the host:**
-
-```bash
-git checkout super-v3 && git pull origin super-v3
-```
-
-**Step 2 — Mount the repo when launching the container:**
-
-```bash
-MEGATRON_BRIDGE_PATH=/path/to/Megatron-Bridge  # set this to your local clone
-
-docker run --rm -it \
-  -v $MEGATRON_BRIDGE_PATH:/opt/Megatron-Bridge \
-  -w /opt/Megatron-Bridge \
-  nvcr.io/nvidia/nemo:26.02.nemotron_3_super \
-  bash
-```
-
----
-
-## Conversion with 🤗 Hugging Face
-
-### Import HF → Megatron
-To import the HF model to your desired `$MEGATRON_PATH`, use the distributed
-conversion script because this model uses expert parallelism. The single-process
-`examples/conversion/convert_checkpoints.py` script is limited to single-GPU conversion
-without model parallelism.
-
-```bash
-HF_MODEL=/path/to/hf/model
-MEGATRON_PATH=/path/to/output/megatron/ckpt
-
-torchrun --nproc-per-node=8 examples/conversion/convert_checkpoints_multi_gpu.py import \
---hf-model $HF_MODEL \
---megatron-path $MEGATRON_PATH \
---tp 1 \
---ep 8
-```
-
-Notes:
-- The default parallelism is TP=1, EP=8 (Expert Parallel)
-- Adjust `--nproc-per-node` based on your available GPUs
-
-### Export Megatron → HF
-```bash
-HF_MODEL=/path/to/hf/model
-MEGATRON_PATH=/path/to/trained/megatron/ckpt
-OUTPUT_PATH=/path/to/output/hf/ckpt
-
-torchrun --nproc-per-node=8 examples/conversion/convert_checkpoints_multi_gpu.py export \
---hf-model $HF_MODEL \
---megatron-path $MEGATRON_PATH \
---hf-path $OUTPUT_PATH \
---tp 1 \
---ep 8
-```
-
-### Roundtrip Testing
-To verify the correctness of import/export conversions:
-
-```bash
-HF_MODEL=/path/to/hf/model
-MEGATRON_PATH=/path/to/megatron/ckpt
-
-torchrun --nproc-per-node=8 examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
---hf-model-id $HF_MODEL \
---megatron-load-path $MEGATRON_PATH \
---tp 1 \
---ep 8 \
---trust-remote-code
-```
-
-### Compare HF and Megatron Outputs
-To compare outputs between HF and Megatron models:
-
-```bash
-HF_MODEL=/path/to/hf/model
-MEGATRON_PATH=/path/to/megatron/ckpt
-
-torchrun --nproc-per-node=8 examples/conversion/compare_hf_and_megatron/compare.py \
---hf_model_path $HF_MODEL \
---megatron_model_path $MEGATRON_PATH \
---prompt "Hello who are " \
---tp 8 \
---ep 8 \
---trust_remote_code
-```
-
-## Pretraining Examples
-
-### Pretraining with Real Data
-```bash
-BLEND_PATH=/path/to/dataset/blend.json
-CHECKPOINT_DIR=/path/to/checkpoints
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_super.py \
---per-split-data-args-path=${BLEND_PATH} \
-logger.wandb_project=your_project \
-logger.wandb_entity=nvidia \
-logger.log_interval=5 \
-checkpoint.load=${CHECKPOINT_DIR} \
-checkpoint.save=${CHECKPOINT_DIR} \
-checkpoint.save_interval=100 \
-train.global_batch_size=8 \
-train.micro_batch_size=1 \
-train.train_iters=1280 \
-scheduler.lr_warmup_iters=128 \
-scheduler.lr_decay_iters=1152 \
-scheduler.lr_wsd_decay_iters=1152 \
-model.tensor_model_parallel_size=4 \
-model.context_parallel_size=1 \
-model.expert_model_parallel_size=64 \
-model.sequence_parallel=True
-```
-
-Notes:
-- **GPU Requirements**: Requires B200 GPUs for NVFP4 support. Minimum of 8 nodes (64 GPUs) required
-- The default parallelism settings are TP=4, EP=64, PP=1, CP=1 with sequence parallel enabled
-- Expert parallelism (EP) is set to 64 for the MoE architecture
-- Adjust batch sizes and iteration counts based on your training requirements
-- Make sure to set up WandB credentials if using WandB logging
-
-### Pretraining with Mock Data
-For quick testing without a dataset:
-
-```bash
-CHECKPOINT_DIR=/path/to/checkpoints
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_super.py \
-logger.wandb_project=your_project \
-logger.wandb_entity=nvidia \
-checkpoint.load=${CHECKPOINT_DIR} \
-checkpoint.save=${CHECKPOINT_DIR} \
-checkpoint.save_interval=100 \
-train.global_batch_size=128 \
-train.train_iters=100 \
-scheduler.lr_warmup_iters=10 \
-model.hybrid_override_pattern="MEME*ME" \
-model.num_layers=7
-```
-
-Notes:
-- If `BLEND_PATH` is not specified, mock dataset will be used
-- The `hybrid_override_pattern` can be used to customize the MoE layer pattern
-- Useful for debugging and testing the training pipeline
-
-
-## Finetuning Recipes
-
-### Full Parameter Fine-Tuning
-```bash
-MEGATRON_PATH=/path/to/pretrained/megatron/ckpt
-CHECKPOINT_DIR=/path/to/finetuned/checkpoints
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_super.py \
-logger.wandb_project=your_project \
-logger.wandb_entity=nvidia \
-logger.log_interval=5 \
-checkpoint.load=${CHECKPOINT_DIR} \
-checkpoint.save=${CHECKPOINT_DIR} \
-checkpoint.save_interval=50 \
-train.global_batch_size=16 \
-train.train_iters=200 \
-scheduler.lr_warmup_iters=10 \
-model.tensor_model_parallel_size=4 \
-model.sequence_parallel=True \
-checkpoint.pretrained_checkpoint=$MEGATRON_PATH
-```
-
-Notes:
-- Default parallelism TP=4, EP=8, PP=1, CP=1 with sequence parallel enabled
-- By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used.
-- Fine-tuning requires a pretrained Megatron checkpoint, which can be obtained from the "Import HF → Megatron" section above
-- Adjust `global_batch_size` and parallelism settings based on your GPU memory and requirements
-
-
-### LoRA Fine-Tuning
-To enable LoRA fine-tuning, pass `--peft lora` to the script:
-
-```bash
-MEGATRON_PATH=/path/to/pretrained/megatron/ckpt
-CHECKPOINT_DIR=/path/to/lora/checkpoints
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_super.py \
---peft lora \
-logger.wandb_project=your_project \
-logger.wandb_entity=nvidia \
-logger.log_interval=5 \
-checkpoint.load=${CHECKPOINT_DIR} \
-checkpoint.save=${CHECKPOINT_DIR} \
-checkpoint.save_interval=100 \
-train.global_batch_size=4 \
-train.train_iters=200 \
-model.tensor_model_parallel_size=4 \
-model.context_parallel_size=2 \
-model.sequence_parallel=True \
-scheduler.lr_warmup_iters=30 \
-checkpoint.pretrained_checkpoint=$MEGATRON_PATH
-```
-
-Notes:
-- By default, the target modules are linear layers `["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]` in the model
-- LoRA fine-tuning uses less memory and can work with smaller batch sizes
-- Consider using Context Parallel (CP) for longer sequences
-
-
-## Quantization (PTQ and QAT)
-
-```{important}
-Quantization support requires the latest code from the `super-v3` branch. See [Getting the Latest Code](#getting-the-latest-code) for instructions.
-```
-
-Nemotron 3 Super supports four quantization configurations:
-
-| Config Name | Format | Description |
-|---|---|---|
-| `mamba_moe_fp8_aggressive` | FP8 | Aggressive FP8 quantization for Mamba-MoE |
-| `mamba_moe_fp8_conservative` | FP8 | Conservative FP8 quantization for Mamba-MoE |
-| `mamba_moe_nvfp4_aggressive` | NVFP4 | Aggressive NVFP4 quantization for Mamba-MoE |
-| `mamba_moe_nvfp4_conservative` | NVFP4 | Conservative NVFP4 quantization for Mamba-MoE |
-
-Pass the desired config name via `--export-quant-cfg` to `quantize.py`.
-
-### Quantize
-```bash
-export HF_MODEL=/path/to/hf/model
-export MEGATRON_SAVE_PATH=/path/to/quantized/megatron/ckpt
-
-torchrun --nproc_per_node=8 examples/quantization/quantize.py \
-    --hf-model-id $HF_MODEL \
-    --export-quant-cfg mamba_moe_nvfp4_conservative \
-    --megatron-save-path $MEGATRON_SAVE_PATH \
-    --pp 1 \
-    --tp 8 \
-    --ep 8 \
-    --trust-remote-code
-```
-
-### Verify with PTQ Generate
-```bash
-torchrun --nproc_per_node=8 examples/quantization/ptq_generate.py \
-    --hf-model-id $HF_MODEL \
-    --megatron-load-path $MEGATRON_SAVE_PATH \
-    --pp 1 \
-    --tp 8 \
-    --ep 8 \
-    --trust-remote-code
-```
-
-Notes:
-- For multi-node setups (e.g. 2 nodes with 8× H100), increase `--pp` accordingly (e.g. `--pp 2`) and use a job scheduler like SLURM to launch across nodes.
-
-### Export Quantized Megatron Checkpoint → HF
-
-After quantization, export the Megatron checkpoint back to Hugging Face format:
-
-```bash
-HF_MODEL=/path/to/hf/model
-MEGATRON_LOAD_PATH=/path/to/quantized/megatron/ckpt
-EXPORT_DIR=/path/to/output/hf/ckpt
-
-torchrun --nproc_per_node=8 examples/quantization/export.py \
-    --hf-model-id $HF_MODEL \
-    --megatron-load-path $MEGATRON_LOAD_PATH \
-    --export-dir $EXPORT_DIR \
-    --pp 8 \
-    --dtype bfloat16 \
-    --trust-remote-code
-```
-
-### Quantization-Aware Training (QAT)
-
-After quantization, further improve model quality with QAT by continuing training from a quantized Megatron checkpoint.
-
-```bash
-MEGATRON_PATH=/path/to/quantized/megatron/ckpt
-CHECKPOINT_DIR=/path/to/qat/checkpoints
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/qat_nemotron_3_super.py \
---megatron-load-path=${MEGATRON_PATH} \
---seq-length=8192 \
---packed-sequence \
-logger.wandb_project=your_project \
-logger.wandb_entity=nvidia \
-logger.log_interval=5 \
-checkpoint.load=${CHECKPOINT_DIR} \
-checkpoint.save=${CHECKPOINT_DIR} \
-checkpoint.save_interval=50 \
-train.global_batch_size=16 \
-train.train_iters=200 \
-scheduler.lr_warmup_iters=10 \
-model.tensor_model_parallel_size=4 \
-model.sequence_parallel=True
-```
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/launch_with_sbatch.sh
-```sh
-#!/bin/bash
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#SBATCH --job-name=megatron-bridge-train
-#SBATCH --nodes=2
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8
-#SBATCH --time=04:00:00
-#SBATCH --partition=gpu
-#SBATCH --account=my_account
-#SBATCH --output=logs/train_%j.out
-#SBATCH --error=logs/train_%j.err
-#SBATCH --exclusive
-
-# ==============================================================================
-# Direct Slurm Launch with sbatch (Alternative to NeMo-Run)
-#
-# This script demonstrates how to launch generic training scripts directly
-# using sbatch without NeMo-Run. This is useful for traditional HPC workflows.
-#
-# Usage:
-#   1. Modify the #SBATCH directives above for your cluster
-#   2. Set the configuration variables below
-#   3. Submit: sbatch launch_with_sbatch.sh
-#
-# For NeMo-Run based launching (recommended for remote management), see
-# launch_with_nemo_run.py
-# ==============================================================================
-
-# ==============================================================================
-# CONFIGURATION - Modify these for your setup
-# ==============================================================================
-
-# Training script to run
-TRAINING_SCRIPT="run_recipe.py"
-# Options:
-# TRAINING_SCRIPT="run_recipe.py"
-# TRAINING_SCRIPT="pretrain_vlm.py"  # For VLM models
-# TRAINING_SCRIPT="finetune_vlm.py"  # For VLM finetuning
-
-# Recipe name (must match a recipe function from megatron.bridge.recipes)
-RECIPE="llama32_1b_pretrain_config"
-# Examples:
-# RECIPE="gemma3_1b_pretrain_config"
-# RECIPE="qwen3_8b_sft_config"
-# RECIPE="llama3_8b_pretrain_config"
-# RECIPE="qwen25_vl_pretrain_config"  # For VLM models
-
-# Forward step type (gpt or vlm)
-STEP_TYPE="gpt"
-
-# Optional: CLI overrides (Hydra-style dot notation)
-CLI_OVERRIDES=""
-# CLI_OVERRIDES="train.train_iters=1000 train.global_batch_size=512 optimizer.lr=0.0002"
-
-# Container image (required)
-CONTAINER_IMAGE=""
-# CONTAINER_IMAGE="/path/to/container.sqsh"
-
-# Container mounts (optional, space-separated)
-CONTAINER_MOUNTS=""
-# CONTAINER_MOUNTS="/data:/data /model:/model"
-
-# ==============================================================================
-# Environment Setup
-# ==============================================================================
-
-# Set common environment variables
-export TORCH_NCCL_AVOID_RECORD_STREAMS=1
-export NCCL_NVLS_ENABLE=0
-
-# Authentication tokens (uncomment and set your tokens)
-# export HF_TOKEN="hf_your_token_here"
-# export WANDB_API_KEY="your_wandb_key_here"
-
-# Optional: Uncomment if needed
-# export CUDA_DEVICE_MAX_CONNECTIONS=1
-# export NCCL_DEBUG=INFO
-
-# ==============================================================================
-# Job Execution
-# ==============================================================================
-
-echo "======================================"
-echo "Megatron Bridge Training Job"
-echo "======================================"
-echo "Job ID: $SLURM_JOB_ID"
-echo "Nodes: $SLURM_JOB_NUM_NODES"
-echo "GPUs per node: $SLURM_GPUS_PER_NODE"
-echo "Script: $TRAINING_SCRIPT"
-echo "Recipe: $RECIPE"
-if [ -n "$HF_TOKEN" ]; then
-    echo "HF_TOKEN: Set"
-fi
-if [ -n "$WANDB_API_KEY" ]; then
-    echo "WANDB_API_KEY: Set"
-fi
-echo "======================================"
-
-# Determine script path
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-SCRIPT_PATH="${SCRIPT_DIR}/${TRAINING_SCRIPT}"
-
-if [ ! -f "$SCRIPT_PATH" ]; then
-    echo "ERROR: Training script not found: $SCRIPT_PATH"
-    exit 1
-fi
-
-# Build torchrun command
-CMD="torchrun"
-CMD="$CMD --nproc_per_node=$SLURM_GPUS_PER_NODE"
-CMD="$CMD --nnodes=$SLURM_JOB_NUM_NODES"
-CMD="$CMD --node_rank=\$SLURM_PROCID"
-CMD="$CMD --master_addr=\$(scontrol show hostname \$SLURM_NODELIST | head -n1)"
-CMD="$CMD --master_port=29500"
-CMD="$CMD $SCRIPT_PATH"
-CMD="$CMD --recipe $RECIPE"
-CMD="$CMD --step $STEP_TYPE"
-
-# Add CLI overrides if specified
-if [ -n "$CLI_OVERRIDES" ]; then
-    CMD="$CMD $CLI_OVERRIDES"
-fi
-
-echo "Executing: $CMD"
-echo "======================================"
-
-# Require container image
-if [ -z "$CONTAINER_IMAGE" ]; then
-    echo "ERROR: CONTAINER_IMAGE must be set. Please use a valid container image."
-    exit 1
-fi
-
-# Build srun command (always containerized)
-SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
-
-# Add container mounts
-if [ -n "$CONTAINER_MOUNTS" ]; then
-    for mount in $CONTAINER_MOUNTS; do
-        SRUN_CMD="$SRUN_CMD --container-mounts=$mount"
-    done
-fi
-
-$SRUN_CMD bash -c "$CMD"
-
-echo "======================================"
-echo "Job completed"
-echo "======================================"
-
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/canonical_lora.png
-```png
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/optimizer-scheduler.md
-```md
-# Optimizer and Scheduler Configuration
-
-The optimizer and scheduler configurations control optimization algorithms, learning rate schedules, and weight decay strategies.
-
-## OptimizerConfig (from Megatron Core)
-
-The `OptimizerConfig` contains all parameters for the optimization algorithm and comes directly from Megatron Core. Key parameters include:
-
-| Parameter | Type | Description |
-|-----------|------|-------------|
-| `optimizer` | `str` | Optimizer type ("adam", "sgd", etc.) |
-| `lr` | `float` | Base learning rate |
-| `min_lr` | `float` | Minimum learning rate for decay schedules |
-| `weight_decay` | `float` | L2 regularization coefficient |
-| `adam_beta1` | `float` | Adam optimizer beta1 parameter |
-| `adam_beta2` | `float` | Adam optimizer beta2 parameter |
-| `adam_eps` | `float` | Adam optimizer epsilon parameter |
-| `clip_grad` | `float` | Gradient clipping threshold |
-| `use_distributed_optimizer` | `bool` | Enable distributed optimizer for memory efficiency |
-| `overlap_grad_reduce` | `bool` | Overlap gradient reduction with computation |
-| `overlap_param_gather` | `bool` | Overlap parameter gathering with computation |
-| `bf16` | `bool` | Use BF16 precision for training |
-| `fp16` | `bool` | Use FP16 precision for training |
-
-## SchedulerConfig
-
-The `SchedulerConfig` controls learning rate scheduling and weight decay progression throughout training.
-
-### Learning Rate Scheduling
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `lr_decay_style` | `Literal["constant", "linear", "cosine", "inverse-square-root", "WSD"]` | `"linear"` | Learning rate decay function |
-| `lr_decay_iters` | `Optional[int]` | `None` | Iterations to decay LR over (defaults to `train_iters`). Use for iteration-based training. |
-| `lr_decay_samples` | `Optional[int]` | `None` | Samples to decay LR over (defaults to `train_samples`). Use for sample-based training. |
-| `lr_warmup_iters` | `int` | `0` | Iterations to linearly warmup learning rate. Use for iteration-based training. |
-| `lr_warmup_samples` | `int` | `0` | Samples to linearly warmup learning rate. Use for sample-based training. |
-| `lr_warmup_fraction` | `Optional[float]` | `None` | Fraction of decay iterations/samples to use for warmup (works with both modes) |
-| `lr_warmup_init` | `float` | `0.0` | Initial learning rate for warmup phase |
-
-**Scheduler Mode Selection**
-
-The scheduler supports two modes that must align with your training configuration:
-
-1. **Iteration-based scheduling**: Use `lr_decay_iters` and `lr_warmup_iters` with `train_iters`.
-2. **Sample-based scheduling**: Use `lr_decay_samples` and `lr_warmup_samples` with `train_samples`.
-
-**Important constraints**
-- Cannot mix iteration-based and sample-based scheduler parameters.
-- Your scheduler mode must match your training mode (iteration-based vs sample-based).
-- `lr_warmup_fraction` is compatible with both modes but cannot be used with explicit warmup iterations/samples.
-
-### WSD (Warmup-Stable-Decay) Scheduling
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `lr_wsd_decay_style` | `Literal["exponential", "linear", "cosine"]` | `"exponential"` | Decay style for WSD annealing phase |
-| `lr_wsd_decay_iters` | `Optional[int]` | `None` | Iterations for WSD annealing phase. Use for iteration-based training. |
-| `lr_wsd_decay_samples` | `Optional[int]` | `None` | Samples for WSD annealing phase. Use for sample-based training. |
-
-### Weight Decay Scheduling
-
-Parameters for controlling the progression of weight decay during training, including start and end values and the scheduling strategy:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `start_weight_decay` | `Optional[float]` | `None` | Initial weight decay coefficient |
-| `end_weight_decay` | `Optional[float]` | `None` | Final weight decay coefficient |
-| `weight_decay_incr_style` | `Literal["constant", "linear", "cosine"]` | `"constant"` | Weight decay progression style |
-
-### Checkpoint Integration
-
-Parameters for managing how scheduler settings are applied during checkpoint loading, allowing control over whether to prioritize config values or restore from saved state:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `override_opt_param_scheduler` | `bool` | `False` | Reset scheduler values from config, ignoring checkpoint |
-| `use_checkpoint_opt_param_scheduler` | `bool` | `False` | Use scheduler values from checkpoint, ignoring config |
-
-### Computed Fields
-
-These fields are automatically calculated during configuration validation and help align training schedules with the configured batch size and iteration counts:
-
-| Field | Description |
-|-------|-------------|
-| `lr_warmup_steps` | Total steps for warmup (calculated from iterations and batch size) |
-| `lr_decay_steps` | Total steps for decay (calculated from iterations and batch size) |
-| `wd_incr_steps` | Total steps for weight decay progression |
-| `wsd_decay_steps` | Total steps for WSD annealing phase |
-
-## Learning Rate Schedules
-
-The following scheduling strategies define how the learning rate evolves during training, each suited to different convergence behaviors and model types:
-| Schedule Type           | Description                                                                 |
-|-------------------------|-----------------------------------------------------------------------------|
-| **Constant**            | Learning rate remains fixed throughout training.                            |
-| **Linear**              | Learning rate decreases linearly from the base LR to the minimum LR.        |
-| **Cosine**              | Learning rate follows a cosine decay curve from base LR to minimum LR.      |
-| **Inverse Square Root** | Learning rate decays proportionally to the inverse square root of the step. |
-
-## WSD (Warmup-Stable-Decay)
-The WSD schedule divides learning rate progression into three distinct phases, offering fine-grained control over early ramp-up, mid-training stability, and final decay:
-| Phase     | Description                                              |
-|-----------|----------------------------------------------------------|
-| **Warmup** | Learning rate increases linearly from initial value to base LR. |
-| **Stable** | Learning rate remains constant at base LR.              |
-| **Decay**  | Learning rate decays to minimum LR using a specified style (e.g., exponential, linear, cosine). |
-
-## Weight Decay Scheduling
-
-These scheduling options control how the weight decay coefficient changes over time, allowing for regularization strategies that adapt to different training phases:
-| Schedule Type | Description                                                                 |
-|---------------|-----------------------------------------------------------------------------|
-| **Constant**  | Fixed weight decay throughout training.                                     |
-| **Linear**    | Linear progression from start to end weight decay.                          |
-| **Cosine**    | Cosine progression from start to end weight decay.                          |
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/nano/slurm_pretrain.sh
-```sh
-#!/bin/bash
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ==============================================================================
-# Nemotron 3 Nano Pretraining
-#
-# Nemotron 3 Nano is a 30B parameter model with A3B (Active 3 Billion) architecture
-# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially.
-#
-# Usage:
-#   1. Modify the #SBATCH directives below for your cluster
-#   2. Set CONTAINER_IMAGE to your container path
-#   3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled)
-#   4. Submit: sbatch slurm_pretrain.sh
-# ==============================================================================
-
-#SBATCH --job-name=nemotron3-pretrain
-#SBATCH --nodes=4
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8
-#SBATCH --time=24:00:00
-#SBATCH --partition=gpu
-#SBATCH --account=my_account
-#SBATCH --output=logs/nemotron3_pretrain_%j.out
-#SBATCH --error=logs/nemotron3_pretrain_%j.err
-#SBATCH --exclusive
-
-# ==============================================================================
-# CONFIGURATION
-# ==============================================================================
-
-# Workspace directory for checkpoints and results
-WORKSPACE=${WORKSPACE:-/workspace}
-
-# Model and training configurations
-MODEL_NAME=nemotron_3_nano
-DATASET_NAME=mock
-SEQ_LENGTH=512
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=32
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR_WARMUP_ITERS=5
-LOG_INTERVAL=1
-WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
-
-# Parallelism configs: "TP,PP,EP,CP,SP" per entry
-PARALLELISM_CONFIGS=("4,1,8,1,True" "2,2,8,1,True" "2,1,8,2,True")
-
-# Container image (required)
-CONTAINER_IMAGE=""
-# CONTAINER_IMAGE="/path/to/container.sqsh"
-
-# Container mounts (optional, space-separated)
-CONTAINER_MOUNTS=""
-# CONTAINER_MOUNTS="/data:/data /workspace:/workspace"
-
-# ==============================================================================
-# Environment Setup
-# ==============================================================================
-
-# NCCL optimizations for large-scale training
-export TORCH_NCCL_AVOID_RECORD_STREAMS=1
-export NCCL_NVLS_ENABLE=0
-
-# UV cache on shared filesystem (recommended for multi-node setups)
-# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
-# export UV_CACHE_DIR="/path/to/shared/uv_cache"
-
-# HuggingFace cache directory (recommended for shared filesystem)
-# export HF_HOME="/path/to/shared/HF_HOME"
-
-# Authentication tokens (set these for your environment)
-# export HF_TOKEN=
-# export WANDB_API_KEY=
-
-# ==============================================================================
-# Job Execution
-# ==============================================================================
-
-echo "======================================"
-echo "Nemotron 3 Nano Pretraining Job"
-echo "======================================"
-echo "Job ID: $SLURM_JOB_ID"
-echo "Nodes: $SLURM_JOB_NUM_NODES"
-echo "GPUs per node: $SLURM_GPUS_PER_NODE"
-echo "Model: $MODEL_NAME"
-echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}"
-echo "======================================"
-
-# Create logs directory if it doesn't exist
-mkdir -p logs
-
-# Require container image
-if [ -z "$CONTAINER_IMAGE" ]; then
-    echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image."
-    exit 1
-fi
-
-# Build srun command (shared across configs)
-SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
-if [ -n "$CONTAINER_MOUNTS" ]; then
-    SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS"
-fi
-echo "SRUN base: $SRUN_CMD"
-echo "======================================"
-
-# Run each parallelism config in sequence
-CONFIG_INDEX=0
-for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do
-    IFS=',' read -r TP PP EP CP SP <<< "$CONFIG"
-    CONFIG_INDEX=$((CONFIG_INDEX + 1))
-    echo ""
-    echo "======================================"
-    echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP"
-    echo "======================================"
-
-    # Build CLI overrides for this config
-    CLI_OVERRIDES="\
-        model.seq_length=$SEQ_LENGTH \
-        train.train_iters=$TRAIN_ITERS \
-        train.global_batch_size=$GLOBAL_BATCH_SIZE \
-        train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
-        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        logger.log_interval=$LOG_INTERVAL \
-        logger.wandb_project=$WANDB_PROJECT \
-        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        dataset.sequence_length=$SEQ_LENGTH \
-        model.tensor_model_parallel_size=$TP \
-        model.pipeline_model_parallel_size=$PP \
-        model.expert_model_parallel_size=$EP \
-        model.sequence_parallel=$SP \
-        model.context_parallel_size=$CP"
-
-    CMD="uv run --no-sync python scripts/training/run_recipe.py"
-    CMD="$CMD --recipe ${MODEL_NAME}_pretrain_config"
-    CMD="$CMD $CLI_OVERRIDES"
-
-    echo "Executing command..."
-    echo $CMD
-    echo "======================================"
-
-    $SRUN_CMD bash -c "$CMD"
-    RUN_EXIT=$?
-    if [ $RUN_EXIT -ne 0 ]; then
-        echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT"
-        exit $RUN_EXIT
-    fi
-done
-
-echo "======================================"
-echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)"
-echo "======================================"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/activation-recomputation-example-2.jpg
-```jpg
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/gpt_oss/README.md
-```md
-# GPT-OSS Examples
-
-This directory contains example scripts for GPT-OSS 20B language models.
-
-For model introduction and architecture details, see the GPT-OSS documentation.
-
-## Workspace Configuration
-
-All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it:
-
-```bash
-export WORKSPACE=/your/custom/path
-```
-
-Directory structure:
-- `${WORKSPACE}/models/` - Converted checkpoints
-- `${WORKSPACE}/results/` - Training outputs and experiment results
-
-## Checkpoint Conversion
-
-See the [conversion.sh](conversion.sh) script for checkpoint conversion examples.
-
-- **Import**: Use `openai/gpt-oss-20b` as the source Hugging Face model.
-- **Export**: Use `unsloth/gpt-oss-20b-BF16` as the reference HF model for export because the exported Megatron checkpoint is unquantized (bf16), which matches that repo's format.
-
-### Import HF → Megatron
-
-To import the HF model to your desired Megatron path:
-
-```bash
-python examples/conversion/convert_checkpoints.py import \
-    --hf-model openai/gpt-oss-20b \
-    --megatron-path ${WORKSPACE}/models/gpt-oss-20b \
-    --trust-remote-code
-```
-
-### Export Megatron → HF
-
-The export uses `unsloth/gpt-oss-20b-BF16` as the reference so the saved HF checkpoint matches that unquantized format:
-
-```bash
-python examples/conversion/convert_checkpoints.py export \
-    --hf-model unsloth/gpt-oss-20b-BF16 \
-    --megatron-path ${WORKSPACE}/models/gpt-oss-20b/iter_0000000 \
-    --hf-path ${WORKSPACE}/models/gpt-oss-20b-hf-export
-```
-
-### Round-trip Validation
-
-Multi-GPU round-trip validation between formats:
-
-```bash
-python -m torch.distributed.run --nproc_per_node=8 \
-    examples/conversion/hf_megatron_roundtrip_multi_gpu.py \
-    --hf-model-id unsloth/gpt-oss-20b-BF16 \
-    --megatron-load-path ${WORKSPACE}/models/gpt-oss-20b/iter_0000000 \
-    --tp 2 --pp 2 \
-    --trust-remote-code
-```
-
-## Training Recipes
-
-- See: [bridge.recipes.gpt_oss](../../../src/megatron/bridge/recipes/gpt_oss/gpt_oss.py)
-- Available recipes:
-  - `gpt_oss_20b_pretrain_config`: Pretraining configuration for 20B
-  - `gpt_oss_20b_pretrain_fp8_current_scaling_config`: Pretraining configuration for 20B with Hopper FP8 current scaling
-  - `gpt_oss_20b_sft_config`: Full SFT configuration for 20B
-  - `gpt_oss_20b_sft_fp8_current_scaling_config`: Full SFT configuration for 20B with Hopper FP8 current scaling
-  - `gpt_oss_20b_peft_config`: LoRA PEFT configuration for 20B
-  - `gpt_oss_20b_peft_fp8_current_scaling_config`: LoRA PEFT configuration for 20B with Hopper FP8 current scaling
-  - `gpt_oss_20b_pretrain_mxfp8_config`: Pretraining configuration for 20B with Blackwell MXFP8
-  - `gpt_oss_20b_sft_mxfp8_config`: Full SFT configuration for 20B with Blackwell MXFP8
-  - `gpt_oss_20b_peft_mxfp8_config`: LoRA PEFT configuration for 20B with Blackwell MXFP8
-  - `gpt_oss_120b_pretrain_config`: Pretraining configuration for 120B
-  - `gpt_oss_120b_sft_config`: Full SFT configuration for 120B
-  - `gpt_oss_120b_peft_config`: LoRA PEFT configuration for 120B
-
-Before training, ensure the following are configured:
-1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path
-2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories
-3. **Environment Variables**:
-   - `HF_TOKEN`: to download models from HF Hub (if required)
-   - `HF_HOME`: (optional) to avoid re-downloading models and datasets
-   - `WANDB_API_KEY`: (optional) to enable WandB logging
-
-All training scripts use SLURM for containerized multi-node training.
-
-### FP8 Training (Hopper GPUs)
-
-The FP8 current scaling recipes enable mixed-precision training with FP8 on Hopper GPUs. To use an FP8 recipe, uncomment the FP8 `RECIPE_NAME` line in the corresponding SLURM script:
-
-- [slurm_pretrain.sh](slurm_pretrain.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_pretrain_fp8_current_scaling_config"`
-- [slurm_sft.sh](slurm_sft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_sft_fp8_current_scaling_config"`
-- [slurm_peft.sh](slurm_peft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_peft_fp8_current_scaling_config"`
-
-### MXFP8 Training (Blackwell GPUs)
-
-MXFP8 (`bf16_with_mxfp8_mixed`) enables mixed-precision training on Blackwell GPUs. To use an MXFP8 recipe, uncomment the MXFP8 `RECIPE_NAME` line in the corresponding SLURM script:
-
-- [slurm_pretrain.sh](slurm_pretrain.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_pretrain_mxfp8_config"`
-- [slurm_sft.sh](slurm_sft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_sft_mxfp8_config"`
-- [slurm_peft.sh](slurm_peft.sh): uncomment `RECIPE_NAME="${MODEL_NAME}_peft_mxfp8_config"`
-
-> **Note**: For GB200 nodes (4 GPUs/node), also update `--gpus-per-node` and `--ntasks-per-node` to 4 in the SBATCH directives.
-
-### Pretrain
-
-Pretrain uses the **DCLM** dataset by default when `DCLM_DATA_DIR` and `DCLM_CACHE` are set (see [slurm_pretrain.sh](slurm_pretrain.sh)). A single random DCLM shard was used for testing.
-
-To use your own preprocessed DCLM data, set the dataset config as follows (e.g. in the recipe or via overrides):
-
-```python
-cfg.dataset.blend = [
-    [f"/path/to/dclm/preprocessed/dclm_{i:02d}_text_document" for i in range(1, 11)],
-    None,
-]
-cfg.dataset.split = "9999,8,2"
-cfg.dataset.path_to_cache = "/path/to/cache"
-```
-
-Preprocess your data using the [DCLM data preprocessing tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/data/dclm).
-
-### Supervised Fine-Tuning (SFT)
-
-See the [slurm_sft.sh](slurm_sft.sh) script for full parameter fine-tuning. The recipe uses sequence packing by default.
-
-### Parameter-Efficient Fine-Tuning (PEFT) with LoRA
-
-See the [slurm_peft.sh](slurm_peft.sh) script for LoRA fine-tuning. The recipe uses sequence packing by default.
-
-### Expected Training Dynamics
-We provide a [Weights & Biases report](https://api.wandb.ai/links/nvidia-nemo-fw-public/xs3rmk4t) for the expected loss curves and grad norms.
-
-## Inference
-
-See [inference.sh](inference.sh) for text generation with:
-- Hugging Face checkpoint (`unsloth/gpt-oss-20b-BF16`)
-- Imported Megatron checkpoint (after [conversion.sh](conversion.sh) import)
-- Exported HF checkpoint (after conversion export)
-- **SFT (finetuned) checkpoint**: set `SFT_CHECKPOINT` to your [slurm_sft.sh](slurm_sft.sh) result dir and run:
-
-```bash
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/conversion/hf_to_megatron_generate_text.py \
-    --hf_model_path unsloth/gpt-oss-20b-BF16 \
-    --megatron_model_path ${WORKSPACE}/results/gpt_oss_20b_finetune_tp2_pp2_ep4_spTrue_cp1 \
-    --prompt "Hello, how are you?" \
-    --max_new_tokens 64 \
-    --tp 2 --pp 2 --ep 2 --etp 1 \
-    --trust-remote-code
-```
-
-TP×PP×EP must equal `--nproc_per_node`. Adjust parallelism to match your SFT run.
-
-## Evaluation
-
-Coming soon.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/utils/omegaconf_utils.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utilities for working with OmegaConf and dataclass configurations."""
-
-import dataclasses
-import functools
-import inspect
-import logging
-from enum import Enum
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, TypeVar
-
-import torch
-from hydra._internal.config_loader_impl import ConfigLoaderImpl
-from hydra.core.override_parser.overrides_parser import OverridesParser
-from omegaconf import DictConfig, OmegaConf
-
-# Re-export so existing callers (e.g. transformer_config.py) keep working.
-from megatron.bridge.utils.activation_map import callable_to_str, str_to_callable  # noqa: F401
-
-
-logger = logging.getLogger(__name__)
-
-DataclassInstance = TypeVar("DataclassInstance")
-
-# Sentinel object to distinguish between "exclude this field" and "field is legitimately None"
-_EXCLUDE_FIELD = object()
-
-# Fields whose callables should be serialized as strings (not excluded)
-_SERIALIZABLE_CALLABLE_FIELDS: frozenset[str] = frozenset({"activation_func"})
-
-
-def create_omegaconf_dict_config(config_container: Any) -> Tuple[DictConfig, Dict[str, Any]]:
-    """Create OmegaConf while tracking excluded fields for later restoration.
-
-    This function combines the conversion to OmegaConf with tracking of excluded
-    callable fields, allowing them to be restored after override processing.
-
-    Args:
-        config_container: The dataclass instance to convert
-
-    Returns:
-        Tuple of (OmegaConf DictConfig, excluded fields dictionary)
-
-    Raises:
-        ValueError: If the conversion fails
-    """
-    logger.debug("Starting safe OmegaConf conversion with callable preservation...")
-
-    # Track all callable fields that will be excluded
-    excluded_callables = _track_excluded_fields(config_container, "root")
-    logger.debug(f"Found {len(excluded_callables)} callable fields to preserve")
-
-    # Convert to OmegaConf (excluding callables)
-    base_dict = _dataclass_to_omegaconf_dict(config_container, "root")
-
-    if base_dict is _EXCLUDE_FIELD:
-        raise ValueError("Root configuration object was excluded (likely a callable)")
-
-    # Verify no callables remain
-    if not _verify_no_callables(base_dict, "root"):
-        raise ValueError("Callable objects found in converted dictionary")
-
-    # Create OmegaConf
-    omega_conf = OmegaConf.create(base_dict)
-
-    return omega_conf, excluded_callables
-
-
-def apply_overrides(
-    config_obj: DataclassInstance, overrides_dict: Dict[str, Any], excluded_fields: Dict[str, Any]
-) -> None:
-    """Apply overrides while preserving excluded callable fields.
-
-    This function first applies the overrides using the standard recursive approach,
-    then restores the callable fields that were excluded during OmegaConf conversion.
-
-    Args:
-        config_obj: The dataclass instance to modify
-        overrides_dict: Dictionary of override values to apply
-        excluded_fields: Dictionary of excluded callable fields to restore
-    """
-    # Apply normal overrides
-    _apply_overrides(config_obj, overrides_dict)
-
-    # Restore excluded fields
-    _restore_excluded_fields(config_obj, excluded_fields)
-
-    logger.debug("Configuration updated with overrides and excluded fields preserved")
-
-
-def process_config_with_overrides(
-    config: DataclassInstance,
-    config_filepath: str | None = None,
-    cli_overrides: list[str] | None = None,
-) -> DataclassInstance:
-    """Process a configuration object with optional YAML file and CLI overrides.
-
-    This function provides a unified way to:
-    1. Convert the config to OmegaConf while preserving callable fields
-    2. Merge an optional YAML configuration file
-    3. Apply optional CLI overrides using Hydra syntax
-    4. Apply the final configuration back to the original object
-
-    Args:
-        config: The dataclass configuration instance to process
-        config_filepath: Optional path to a YAML config file to merge
-        cli_overrides: Optional list of Hydra-style CLI override strings
-
-    Returns:
-        The modified configuration object with all overrides applied
-
-    Raises:
-        FileNotFoundError: If the specified config_filepath does not exist
-        OverridesError: If there's an error parsing CLI overrides
-
-    Example:
-        >>> config = load_recipe("llama3_8b")
-        >>> config = process_config_with_overrides(
-        ...     config,
-        ...     config_filepath="my_config.yaml",
-        ...     cli_overrides=["model_config.hidden_size=4096", "training_config.lr=1e-4"]
-        ... )
-    """
-    # Convert config to OmegaConf, tracking excluded callable fields
-    omega_conf, excluded_fields = create_omegaconf_dict_config(config)
-
-    # Merge YAML config file if provided
-    if config_filepath:
-        config_filepath = Path(config_filepath)
-        if not config_filepath.exists():
-            raise FileNotFoundError(f"Config file not found: {config_filepath}")
-
-        yaml_conf = OmegaConf.load(config_filepath)
-        omega_conf = OmegaConf.merge(omega_conf, yaml_conf)
-        logger.debug(f"Merged configuration from {config_filepath}")
-
-    # Apply CLI overrides if provided
-    if cli_overrides:
-        omega_conf = parse_hydra_overrides(omega_conf, cli_overrides)
-        logger.debug(f"Applied {len(cli_overrides)} CLI overrides")
-
-    # Convert back to dict and apply to original config object
-    final_config_dict = OmegaConf.to_container(omega_conf, resolve=True)
-    apply_overrides(config, final_config_dict, excluded_fields)
-
-    return config
-
-
-def parse_hydra_overrides(cfg: DictConfig, overrides: List[str]) -> DictConfig:
-    """Parse and apply Hydra overrides to an OmegaConf config.
-
-    This function uses Hydra's override parser to support advanced override syntax
-    including additions (+), deletions (~), and complex nested operations.
-
-    Args:
-        cfg: OmegaConf config to apply overrides to
-        overrides: List of Hydra override strings
-
-    Returns:
-        Updated config with overrides applied
-
-    Raises:
-        OverridesError: If there's an error parsing or applying overrides
-    """
-    try:
-        OmegaConf.set_struct(cfg, True)
-        parser = OverridesParser.create()
-        parsed = parser.parse_overrides(overrides=overrides)
-        ConfigLoaderImpl._apply_overrides_to_config(overrides=parsed, cfg=cfg)
-        return cfg
-    except Exception as e:
-        raise OverridesError(f"Failed to parse Hydra overrides: {str(e)}") from e
-
-
-class OverridesError(Exception):
-    """Custom exception for Hydra override parsing errors."""
-
-    pass
-
-
-def _is_omegaconf_problematic(val: Any) -> bool:
-    """Check if a value is a callable that OmegaConf cannot handle.
-
-    OmegaConf cannot serialize function objects, methods, or partial functions.
-    This function identifies such problematic callables while allowing class types.
-
-    Args:
-        val: The value to check
-
-    Returns:
-        True if the value is a problematic callable, False otherwise
-    """
-    if val is None:
-        return False
-
-    # Allow classes/types
-    if isinstance(val, type):
-        return False
-
-    # Block function objects, methods, partial functions, etc.
-    if callable(val) or (
-        hasattr(val, "__call__")
-        and (hasattr(val, "__module__") or hasattr(val, "__qualname__") or isinstance(val, functools.partial))
-    ):
-        return True
-
-    # Block arbitrary objects that are not dataclasses or safe primitives
-    if not isinstance(
-        val, (int, float, bool, str, list, tuple, dict, Path, Enum, torch.dtype)
-    ) and not dataclasses.is_dataclass(val):
-        return True
-
-    return False
-
-
-def _dataclass_to_omegaconf_dict(val_to_convert: Any, path: str = "") -> Any:
-    """Recursively convert a dataclass instance to a dictionary suitable for OmegaConf.create.
-
-    This function completely excludes problematic callable objects to prevent OmegaConf errors.
-    It handles dataclasses, lists, tuples, dictionaries, and primitive types, while converting
-    torch.dtype objects to strings for serialization.
-
-    Args:
-        val_to_convert: The value to convert
-        path: Current path for debugging (e.g., "model_config.activation_func")
-
-    Returns:
-        Converted value suitable for OmegaConf, or _EXCLUDE_FIELD for excluded callables
-    """
-    current_path = path
-
-    # Handle Hugging Face GenerationConfig / PretrainedConfig by converting to a callable dict
-    # compatible with our YAML representer logic
-    try:
-        from transformers import GenerationConfig, PretrainedConfig  # type: ignore
-
-        if isinstance(val_to_convert, (GenerationConfig, PretrainedConfig)):
-            cfg_class = val_to_convert.__class__
-            target = f"{inspect.getmodule(cfg_class).__name__}.{cfg_class.__qualname__}.from_dict"
-            logger.debug(f"Converting {cfg_class.__qualname__} at {current_path} to callable dict")
-            return {
-                "_target_": target,
-                "_call_": True,
-                "config_dict": val_to_convert.to_dict(),
-            }
-    except ModuleNotFoundError:
-        # transformers is optional; if unavailable, fall through to other handlers
-        pass
-
-    # Explicitly handle torch.dtype - convert to string
-    if isinstance(val_to_convert, torch.dtype):
-        logger.debug(f"Converting torch.dtype at {current_path}: {val_to_convert}")
-        return str(val_to_convert)
-
-    # Handle callables — serialize known activation functions as strings,
-    # exclude everything else.
-    if _is_omegaconf_problematic(val_to_convert):
-        field_name = current_path.rsplit(".", 1)[-1] if "." in current_path else current_path
-        if field_name in _SERIALIZABLE_CALLABLE_FIELDS:
-            str_name = callable_to_str(val_to_convert)
-            if str_name is not None:
-                logger.debug(f"Serializing callable at {current_path} as string: {str_name}")
-                return str_name
-        logger.debug(f"Excluding callable at {current_path}: {type(val_to_convert)} - {val_to_convert}")
-        return _EXCLUDE_FIELD
-
-    # Handle dataclasses
-    elif dataclasses.is_dataclass(val_to_convert) and not isinstance(val_to_convert, type):
-        res = {}
-        for field in dataclasses.fields(val_to_convert):
-            field_name = field.name
-            field_path = f"{current_path}.{field_name}" if current_path else field_name
-
-            try:
-                field_value = getattr(val_to_convert, field_name)
-                converted_value = _dataclass_to_omegaconf_dict(field_value, field_path)
-
-                # Only exclude fields marked with sentinel (not legitimate None values)
-                if converted_value is not _EXCLUDE_FIELD:
-                    res[field_name] = converted_value
-                else:
-                    logger.debug(f"Excluded field {field_path}")
-
-            except (AttributeError, TypeError) as e:
-                # Only catch specific exceptions from field access
-                logger.warning(f"Error processing field {field_path}: {e}")
-                continue
-
-        return res
-
-    # Handle lists
-    elif isinstance(val_to_convert, list):
-        result = []
-        for i, item in enumerate(val_to_convert):
-            item_path = f"{current_path}[{i}]"
-            converted_item = _dataclass_to_omegaconf_dict(item, item_path)
-
-            # Only exclude items marked with sentinel (not legitimate None values)
-            if converted_item is not _EXCLUDE_FIELD:
-                result.append(converted_item)
-
-        return result
-
-    # Handle tuples
-    elif isinstance(val_to_convert, tuple):
-        converted_items = []
-        for i, item in enumerate(val_to_convert):
-            item_path = f"{current_path}[{i}]"
-            converted_item = _dataclass_to_omegaconf_dict(item, item_path)
-
-            # Only exclude items marked with sentinel (not legitimate None values)
-            if converted_item is not _EXCLUDE_FIELD:
-                converted_items.append(converted_item)
-
-        return tuple(converted_items)
-
-    # Handle dictionaries
-    elif isinstance(val_to_convert, dict):
-        result = {}
-        for key, value in val_to_convert.items():
-            key_path = f"{current_path}.{key}" if current_path else str(key)
-            converted_value = _dataclass_to_omegaconf_dict(value, key_path)
-
-            # Only exclude values marked with sentinel (not legitimate None values)
-            if converted_value is not _EXCLUDE_FIELD:
-                result[key] = converted_value
-
-        return result
-
-    # Return primitive types as-is (including legitimate None values)
-    else:
-        return val_to_convert
-
-
-def _track_excluded_fields(obj: Any, path: str = "") -> Dict[str, Any]:
-    """Track all excluded callable fields and their original values.
-
-    This function recursively traverses a dataclass structure and builds a mapping
-    of field paths to their original callable values that will be excluded during
-    OmegaConf conversion.
-
-    Args:
-        obj: The object to analyze for callable fields
-        path: Current path prefix for building field paths
-
-    Returns:
-        Dictionary mapping field paths to their original callable values
-    """
-    excluded_fields = {}
-
-    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
-        for field in dataclasses.fields(obj):
-            field_name = field.name
-            field_path = f"{path}.{field_name}" if path else field_name
-            field_value = getattr(obj, field_name)
-
-            if _is_omegaconf_problematic(field_value):
-                # Skip fields that are serialized as strings (not excluded)
-                if field_name in _SERIALIZABLE_CALLABLE_FIELDS and callable_to_str(field_value) is not None:
-                    logger.debug(f"Skipping serializable callable (not excluded): {field_path}")
-                else:
-                    excluded_fields[field_path] = field_value
-                    logger.debug(f"Tracking excluded callable: {field_path}")
-            elif dataclasses.is_dataclass(field_value):
-                nested_excluded = _track_excluded_fields(field_value, field_path)
-                excluded_fields.update(nested_excluded)
-            elif isinstance(field_value, dict):
-                for key, value in field_value.items():
-                    if _is_omegaconf_problematic(value):
-                        excluded_fields[f"{field_path}.{key}"] = value
-
-    return excluded_fields
-
-
-def _restore_excluded_fields(config_obj: Any, excluded_fields: Dict[str, Any]) -> None:
-    """Restore excluded callable fields to their original values.
-
-    After applying overrides from OmegaConf, this function restores the callable
-    fields that were excluded during the conversion process.
-
-    Args:
-        config_obj: The configuration object to restore fields on
-        excluded_fields: Dictionary mapping field paths to their original values
-    """
-    for field_path, original_value in excluded_fields.items():
-        try:
-            # Navigate to the parent object and field name
-            path_parts = field_path.split(".")
-            if path_parts[0] == "root":
-                path_parts = path_parts[1:]  # Remove "root" prefix
-
-            current_obj = config_obj
-
-            # Navigate to the parent object
-            for part in path_parts[:-1]:
-                current_obj = getattr(current_obj, part)
-
-            field_name = path_parts[-1]
-
-            # Restore the original callable
-            setattr(current_obj, field_name, original_value)
-            logger.debug(f"Restored callable field: {field_path}")
-
-        except (AttributeError, TypeError) as e:
-            logger.warning(f"Failed to restore callable field {field_path}: {e}")
-
-
-def _verify_no_callables(obj: Any, path: str = "") -> bool:
-    """Recursively verify that no callable objects remain in the converted structure.
-
-    This function is used for validation to ensure that all problematic callables
-    have been successfully excluded from a data structure before OmegaConf conversion.
-
-    Args:
-        obj: The object to verify
-        path: Current path for error reporting
-
-    Returns:
-        True if no problematic callables are found, False otherwise
-    """
-    if _is_omegaconf_problematic(obj):
-        logger.error(f"Found problematic callable at {path}: {obj}")
-        return False
-
-    elif isinstance(obj, dict):
-        for key, value in obj.items():
-            key_path = f"{path}.{key}" if path else str(key)
-            if not _verify_no_callables(value, key_path):
-                return False
-
-    elif isinstance(obj, (list, tuple)):
-        for i, item in enumerate(obj):
-            item_path = f"{path}[{i}]"
-            if not _verify_no_callables(item, item_path):
-                return False
-
-    return True
-
-
-def _apply_overrides(config_obj: DataclassInstance, overrides_dict: Dict[str, Any]) -> None:
-    """Recursively apply overrides from a Python dictionary to a dataclass instance.
-
-    This function traverses nested dataclass structures and applies override values
-    from a dictionary. It handles type conversions for special cases like torch.dtype.
-    It also handles dictionaries with _target_ fields by instantiating them properly.
-
-    Args:
-        config_obj: The dataclass instance to modify
-        overrides_dict: Dictionary of override values to apply
-    """
-    if not dataclasses.is_dataclass(config_obj):
-        logger.debug(f"Skipping apply_overrides for non-dataclass config_obj: {type(config_obj)}")
-        return
-
-    for key, value in overrides_dict.items():
-        if not hasattr(config_obj, key):
-            logger.warning(
-                f"Key '{key}' in overrides not found in config object {type(config_obj).__name__}. Skipping."
-            )
-            continue
-
-        current_attr = getattr(config_obj, key)
-
-        # Handle dictionaries with _target_ fields
-        if isinstance(value, dict) and "_target_" in value:
-            try:
-                from megatron.bridge.utils.instantiate_utils import instantiate
-
-                instantiated_obj = instantiate(value)
-                setattr(config_obj, key, instantiated_obj)
-                logger.debug(f"Successfully instantiated {key} from _target_: {value['_target_']}")
-                continue
-            except Exception as e:
-                logger.warning(f"Failed to instantiate {key} from _target_: {e}")
-
-        # Handle nested dataclass structures
-        if dataclasses.is_dataclass(current_attr) and isinstance(value, dict):
-            _apply_overrides(current_attr, value)
-        else:
-            try:
-                # Handle special case conversions if needed
-                final_value = value
-
-                # If the original was a torch.dtype and value is a string, convert back
-                if isinstance(current_attr, torch.dtype) and isinstance(value, str):
-                    from megatron.bridge.utils.activation_map import str_to_dtype
-
-                    try:
-                        final_value = str_to_dtype(value)
-                    except ValueError:
-                        logger.warning(f"Could not convert string '{value}' back to torch.dtype")
-                        final_value = value
-
-                # Restore serialized callable fields (e.g. "relu" → F.relu)
-                if key in _SERIALIZABLE_CALLABLE_FIELDS and isinstance(final_value, str):
-                    try:
-                        final_value = str_to_callable(final_value)
-                    except ValueError:
-                        logger.warning(f"Could not restore callable for {key}='{final_value}'; keeping string")
-
-                setattr(config_obj, key, final_value)
-                logger.debug(f"Set {type(config_obj).__name__}.{key} = {final_value}")
-
-            except Exception as e:
-                logger.warning(
-                    f"Could not set attribute {type(config_obj).__name__}.{key} to value '{value}'. Error: {e}"
-                )
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/nano/pretrain_nemotron_3_nano.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import os
-import sys
-from typing import Tuple
-
-import torch
-from omegaconf import OmegaConf
-
-from megatron.bridge.recipes.nemotronh.nemotron_3_nano import (
-    nemotron_3_nano_pretrain_config as pretrain_config,
-)
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-    parse_hydra_overrides,
-)
-
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]:
-    """Parse command line arguments, separating known script args from OmegaConf overrides."""
-    parser = argparse.ArgumentParser(
-        description="Pretrain Nemotron 3 Nano model using Megatron-Bridge with YAML and CLI overrides",
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--config-file",
-        type=str,
-        help="Path to the YAML OmegaConf override file.",
-    )
-    parser.add_argument("--per-split-data-args-path", type=str, help="Path to the per split data args file.")
-
-    # Parse known args for the script, remaining will be treated as overrides
-    args, cli_dotlist_overrides = parser.parse_known_args()
-    return args, cli_dotlist_overrides
-
-
-def main() -> None:
-    """
-    Entry point for the Nemotron 3 Nano pretraining script.
-    """
-    args, cli_overrides = parse_cli_args()
-
-    cfg: ConfigContainer = pretrain_config(
-        per_split_data_args_path=args.per_split_data_args_path,
-    )
-
-    # Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-    merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-    # Load and merge YAML overrides if a config file is provided
-    if args.config_file:
-        logger.debug(f"Loading YAML overrides from: {args.config_file}")
-        if not os.path.exists(args.config_file):
-            logger.error(f"Override YAML file not found: {args.config_file}")
-            sys.exit(1)
-        yaml_overrides_omega = OmegaConf.load(args.config_file)
-        merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega)
-        logger.debug("YAML overrides merged successfully.")
-
-    # Apply command-line overrides using Hydra-style parsing
-    if cli_overrides:
-        logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}")
-        merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides)
-        logger.debug("Hydra-style command-line overrides applied successfully.")
-
-    # Apply the final merged OmegaConf configuration back to the original ConfigContainer
-    logger.debug("Applying final merged configuration back to Python ConfigContainer...")
-    final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-    # Apply overrides while preserving excluded fields
-    apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-
-    # Start training
-    logger.debug("Starting pretraining...")
-    pretrain(config=cfg, forward_step_func=forward_step)
-
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/peft.md
-```md
-# Parameter-Efficient Fine-Tuning (PEFT)
-
-This guide explains how to configure and use PEFT in Megatron Bridge—covering LoRA and DoRA, required checkpoints, example configurations, and the internal design and training workflow—so you can integrate, scale, and checkpoint adapters efficiently.
-
-## Model Customization
-Customizing models enables you to adapt a general pre-trained model to a specific use case or domain. This process produces a fine-tuned model that retains the broad knowledge from pretraining while delivering more accurate outputs for targeted downstream tasks.
-
-Model customization is typically achieved through supervised fine-tuning, which falls into two main approaches: Full-Parameter Fine-Tuning, known as Supervised Fine-Tuning (SFT), and Parameter-Efficient Fine-Tuning (PEFT).
-
-In SFT, all model parameters are updated to align the model’s outputs with the task-specific requirements. This approach often yields the highest performance but can be computationally intensive.
-
-PEFT, by contrast, updates only a small subset of parameters that are inserted into the base model at strategic locations. The base model weights remain frozen, and only the adapter modules are trained. This significantly reduces the number of trainable parameters—often to less than 1%—while still achieving near-SFT levels of accuracy.
-
-As language models continue to grow in size, PEFT is gaining popularity for its efficiency and minimal hardware demands, making it a practical choice for many real-world applications.
-
-## PEFT Configuration
-
-PEFT is configured as an optional attribute in `ConfigContainer`:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.peft.lora import LoRA
-
-config = ConfigContainer(
-    # ... other required configurations
-    peft=LoRA(
-        target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-        dim=16,
-        alpha=32,
-        dropout=0.1,
-    ),
-    checkpoint=CheckpointConfig(
-        pretrained_checkpoint="/path/to/pretrained/checkpoint",  # Required for PEFT
-        save="/path/to/peft/checkpoints",
-    ),
-)
-```
-
-```{note}
-**Requirements**: PEFT requires `checkpoint.pretrained_checkpoint` to be set to load the base model weights.
-```
-
-## Supported PEFT Methods
-
-### [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)
-
-LoRA makes fine-tuning efficient by representing weight updates with two low-rank decomposition matrices. The original model weights remain frozen, while the low-rank decomposition matrices are updated to adapt to the new data, keeping the number of trainable parameters low. In contrast with adapters, the original model weights and adapted weights can be combined during inference, avoiding any architectural change or additional latency in the model at inference time.
-
-In Megatron Bridge, you can configure both the adapter bottleneck dimension and the target modules where LoRA is applied. LoRA supports any linear layer, which in transformer models typically includes:
-
-1. Query, key, and value (QKV) attention projections  
-2. The attention output projection  
-3. One or both MLP layers  
-
-Megatron Bridge fuses the QKV projections into a single linear layer. As a result, LoRA learns a unified low-rank adaptation for the combined QKV representation.
-
-```python
-from megatron.bridge.peft.lora import LoRA
-
-lora_config = LoRA(
-    target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-    dim=16,                    # Rank of adaptation
-    alpha=32,                  # Scaling parameter  
-    dropout=0.1,               # Dropout rate
-)
-```
-
-#### Key Parameters
-The following table lists key hyperparameters for configuring DoRA, which control its module targeting, adaptation rank, scaling behavior, and regularization strategy.
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `target_modules` | `List[str]` | All linear layers | Modules to apply DoRA to |
-| `dim` | `int` | `32` | Rank of the low-rank adaptation |
-| `alpha` | `float` | `16` | Scaling parameter for DoRA |
-| `dropout` | `float` | `0.0` | Dropout rate for DoRA layers |
-
-#### Target Modules
-The following table lists specific submodules within transformer architectures that are commonly targeted for LoRA, enabling efficient fine-tuning of attention and feedforward components:
-| Module        | Description                                 |
-|---------------|---------------------------------------------|
-| `linear_qkv`  | Query, key, value projections in attention  |
-| `linear_proj` | Attention output projection                 |
-| `linear_fc1`  | First MLP layer                             |
-| `linear_fc2`  | Second MLP layer                            |
-
-#### Wildcard Target Modules
-For more granular targeting, individual layers can be targeted for the adapters.
-```python
-# Target specific layers only
-lora_config = LoRA(
-    target_modules=[
-        "*.layers.0.*.linear_qkv",   # First layer only
-        "*.layers.1.*.linear_qkv",   # Second layer only
-    ]
-)
-```
-
-### Canonical LoRA: Performant vs Canonical Variants
-
-There are two variants of LoRA implemented in Megatron Bridge: "performant LoRA" (`LoRA`) and "canonical LoRA" (`CanonicalLoRA`).
-
-The distinction comes from the fact that Megatron Core optimizes the implementation of the following two linear modules by fusing multiple linear layers into one layer. When these layers are adapted with LoRA, the performant version also uses only one adapter for the linear module. The two linear modules are:
-
-1. `linear_qkv`: The projection matrix in self attention that transforms hidden state to query, key and value. Megatron Core fuses these three projection matrices into a single matrix to efficiently parallelize the matrix multiplication. Hence, performant LoRA applies a single adapter to the qkv projection matrix, whereas canonical LoRA applies three adapters.
-2. `linear_fc1`: The first linear layer in the MLP module before the intermediate activation. For gated linear activations, Megatron Core fuses the up and gate projection matrices into a single matrix for efficient parallelization. Hence, performant LoRA applies a single adapter to the up and gate projection matrices, whereas canonical LoRA applies two adapters.
-
-The following two figures illustrate the difference between canonical and performant LoRA, using the `linear_qkv` layer as an example. Canonical LoRA runs three adapters sequentially, while performant LoRA runs one adapter.
-
-```{image} images/canonical_lora.png
-:width: 640
-:align: center
-```
-
-```{image} images/performant_lora.png
-:width: 400
-:align: center
-```
-
-Canonical LoRA conforms more closely to reference implementations, though it is slower in comparison since it performs several matrix multiplications sequentially, as described above. Performant LoRA has fewer parameters than canonical LoRA and can often achieve the same level of accuracy as canonical LoRA.
-
-Though not immediately apparent, performant LoRA is mathematically equivalent to canonical LoRA when the $A_q$, $A_k$, $A_v$ matrices are tied (i.e. forced to share the same weight during training) in `linear_qkv`, and similarly when the $A_{up}$, $A_{gate}$ matrices are tied in `linear_fc1`.
-
-```{admonition} Mathematical Proof: Performant LoRA Equivalence to Canonical LoRA with Tied Weights
-:class: dropdown
-
-Let $[x \quad y]$ denote matrix concatenation. (In Megatron Bridge, this concatenation is done in an interleaved fashion, but this does not affect the proof below.)
-
-Let $A_q = A_k = A_v = A_{qkv}$ (weight tying)
-
-Then
-
-$$
-\begin{align}
-& [query \quad key \quad value] \\
-= & [W_q x + B_q A_q x \quad W_k x + B_k A_k x \quad W_v x + B_v A_v x] \quad\quad \text{(canonical formulation)} \\
-= & [W_q x + B_q (A_{qkv} x) \quad W_k x + B_k (A_{qkv} x) \quad W_v x + B_v (A_{qkv} x)] \\
-= & [W_q \quad W_k \quad W_v] x + [B_q \quad B_k \quad B_v]A_{qkv} x \\
-= & W_{qkv} x + B_{qkv} A_{qkv} x  \quad\quad \text{(performant formulation)}
-\end{align}
-$$
-
-Note: dimensions of weight matrices are as follows:
-
-$$
-\begin{align}
-W_q:     &\ n_q d \times h          \qquad & A_q:     &\ r \times h \qquad  & B_q:     &\ n_q d \times r \\
-W_k:     &\ n_{kv} d \times h       \qquad & A_k:     &\ r \times h \qquad  & B_k:     &\ n_{kv} d \times r \\
-W_v:     &\ n_{kv} d \times h       \qquad & A_v:     &\ r \times h \qquad  & B_v:     &\ n_{kv} d \times r \\
-W_{qkv}: &\ (n_q+2n_{kv})d \times h \qquad & A_{qkv}: &\ r \times h \qquad  & B_{qkv}: &\ (n_q+2n_{kv})d \times r
-\end{align}
-$$
-
-Where:
-- $n_q$: Number of attention heads (`num_attention_heads`).
-- $n_{kv}$: Number of key value heads (`num_query_groups`). Note that if grouped query attention (GQA) is not used, $n_{kv} = n_q$.
-- $h$: Transformer hidden size (`hidden_size`).
-- $d$: Transformer head dimension (`kv_channels`).
-- $r$: LoRA rank.
-
-```
-
-#### Using Canonical LoRA
-
-```python
-from megatron.bridge.peft.canonical_lora import CanonicalLoRA
-
-canonical_lora_config = CanonicalLoRA(
-    target_modules=[
-        "linear_q", "linear_k", "linear_v",      # Individual Q, K, V projections
-        "linear_proj",                           # Attention output projection
-        "linear_fc1_up", "linear_fc1_gate",     # Individual up and gate projections
-        "linear_fc2"                             # Second MLP layer
-    ],
-    dim=16,                    # Rank of adaptation
-    alpha=32,                  # Scaling parameter
-    dropout=0.1,               # Dropout rate
-)
-```
-
-#### Key Parameters
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `target_modules` | `List[str]` | All canonical linear layers | Modules to apply canonical LoRA to |
-| `dim` | `int` | `32` | Rank of the low-rank adaptation |
-| `alpha` | `float` | `32` | Scaling parameter for LoRA |
-| `dropout` | `float` | `0.0` | Dropout rate for LoRA layers |
-| `dropout_position` | `Literal["pre", "post"]` | `"pre"` | Position for applying dropout |
-| `lora_A_init_method` | `str` | `"xavier"` | Initialization method for LoRA A matrix |
-| `lora_B_init_method` | `str` | `"zero"` | Initialization method for LoRA B matrix |
-
-#### Target Modules for Canonical LoRA
-
-The following table lists specific submodules within transformer architectures that are targeted for canonical LoRA:
-
-| Module | Description |
-|--------|-------------|
-| `linear_q` | Query projection in attention |
-| `linear_k` | Key projection in attention |
-| `linear_v` | Value projection in attention |
-| `linear_proj` | Attention output projection |
-| `linear_fc1_up` | Up projection in MLP |
-| `linear_fc1_gate` | Gate projection in MLP |
-| `linear_fc2` | Second MLP layer |
-
-```{note}
-Canonical LoRA does not support `linear_qkv` or `linear_fc1` targets. Use the individual component targets (`linear_q`, `linear_k`, `linear_v` for QKV and `linear_fc1_up`, `linear_fc1_gate` for FC1) instead.
-```
-
-### [DoRA: Weight-Decomposed Low-Rank Adaptation](https://arxiv.org/abs/2402.09353)
-
-DoRA decomposes the pre-trained weight into magnitude and direction. It learns a separate magnitude parameter while employing LoRA for directional updates, efficiently minimizing the number of trainable parameters. DoRA enhances both the learning capacity and training stability of LoRA, while avoiding any additional inference overhead. DoRA has been shown to consistently outperform LoRA on various downstream tasks.
-
-In Megatron Bridge, DoRA leverages the same adapter structure as LoRA. Megatron Bridge adds support for Tensor Parallelism and Pipeline Parallelism for DoRA, enabling DoRA to be scaled to larger model variants.
-
-```python
-from megatron.bridge.peft.dora import DoRA
-
-dora_config = DoRA(
-    target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-    dim=16,                    # Rank of adaptation
-    alpha=32,                  # Scaling parameter
-    dropout=0.1,               # Dropout rate
-)
-```
-
-#### Key Parameters
-
-The following parameters define how DoRA is applied to your model. They control which modules are targeted, the adaptation rank, scaling behavior, and dropout configuration:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `target_modules` | `List[str]` | All linear layers | Modules to apply DoRA to |
-| `dim` | `int` | `32` | Rank of the low-rank adaptation |
-| `alpha` | `float` | `16` | Scaling parameter for DoRA |
-| `dropout` | `float` | `0.0` | Dropout rate for DoRA layers |
-
-## Full Configuration Example
-
-```python
-from megatron.bridge.training.config import (
-    ConfigContainer, TrainingConfig, CheckpointConfig, SchedulerConfig
-)
-from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig
-from megatron.bridge.data.hf_processors.squad import process_squad_example
-from megatron.bridge.peft.lora import LoRA
-from megatron.core.optimizer import OptimizerConfig
-
-# Configure PEFT fine-tuning
-config = ConfigContainer(
-    model=model_provider,
-    train=TrainingConfig(
-        train_iters=1000,
-        global_batch_size=64,
-        micro_batch_size=1,  # Required for packed sequences if used
-        eval_interval=100,
-    ),
-    optimizer=OptimizerConfig(
-        optimizer="adam",
-        lr=1e-4,  # Lower learning rate for fine-tuning
-        weight_decay=0.01,
-        bf16=True,
-        use_distributed_optimizer=True,
-    ),
-    scheduler=SchedulerConfig(
-        lr_decay_style="cosine",
-        lr_warmup_iters=100,
-        lr_decay_iters=1000,
-    ),
-    dataset=HFDatasetConfig(
-        dataset_name="squad",
-        process_example_fn=process_squad_example,
-        seq_length=512,
-    ),
-    checkpoint=CheckpointConfig(
-        pretrained_checkpoint="/path/to/pretrained/model",  # Required
-        save="/path/to/peft/checkpoints",
-        save_interval=200,
-    ),
-    peft=LoRA(
-        target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-        dim=16,
-        alpha=32,
-        dropout=0.1,
-    ),
-    # ... other configurations
-)
-```
-
-## PEFT Design in Megatron Bridge
-
-This section describes the internal design and architecture for how PEFT is integrated into Megatron Bridge.
-
-### Architecture Overview
-
-The PEFT framework introduces a modular design for integrating adapters into large-scale models. Its architecture consists of the following components:
-
-1. **Base PEFT Class**: All PEFT methods inherit from the abstract {py:class}`bridge.peft.base.PEFT` base class, which defines the core interface for module transformation.
-2. **Module Transformation**: PEFT traverses the model structure to identify and transform target modules individually.
-3. **Adapter Integration**: Adapters are injected into selected modules using a pre-wrap hook during model initialization.
-4. **Checkpoint Integration**: Only adapter parameters are saved and loaded during checkpointing; base model weights remain frozen and unchanged.
-
-### PEFT Workflow in Training
-
-The training workflow for PEFT follows a structured sequence that ensures efficient fine-tuning with minimal overhead:
-1. **Model Loading**: The base model is initialized from a specified pretrained checkpoint.
-2. **PEFT Application**: Adapter transformations are applied after Megatron Core model initialization, but before distributed wrapping.
-3. **Parameter Freezing**: Base model parameters are frozen to reduce training complexity; only adapter parameters are updated.
-4. **Adapter Weight Loading**: When resuming training, adapter weights are restored from the checkpoint.
-5. **Checkpoint Saving**: Only adapter states are saved, resulting in significantly smaller checkpoint files.
-
-### Key Benefits
-
-PEFT offers several advantages for scalable and efficient model fine-tuning:
-
-- **Reduced Checkpoint Size**: Adapter-only checkpoints are dramatically smaller than full model checkpoints.
-- **Memory Efficiency**: Since gradients are computed only for adapter parameters, memory usage is significantly reduced.
-- **Resume Support**: Training can be resumed seamlessly using adapter-only checkpoints, without reloading full model weights.
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/gpt_oss/slurm_pretrain.sh
-```sh
-#!/bin/bash
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ==============================================================================
-# GPT-OSS 20B Pretraining
-#
-# GPT-OSS 20B is an MoE language model. Supports multiple parallelism configs:
-# each "TP,PP,EP,CP,SP" runs sequentially.
-#
-# Usage:
-#   1. Modify the #SBATCH directives below for your cluster
-#   2. Set CONTAINER_IMAGE to your container path
-#   3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled)
-#   4. Submit: sbatch slurm_pretrain.sh
-# ==============================================================================
-
-#SBATCH --job-name=gpt-oss-pretrain
-#SBATCH --nodes=4
-#SBATCH --ntasks-per-node=8  # Change to 4 for GB200 (Blackwell, 4 GPUs/node)
-#SBATCH --gpus-per-node=8    # Change to 4 for GB200 (Blackwell, 4 GPUs/node)
-#SBATCH --time=24:00:00
-#SBATCH --partition=batch
-#SBATCH --account=my_account
-#SBATCH --output=logs/gpt_oss_pretrain_%j.out
-#SBATCH --error=logs/gpt_oss_pretrain_%j.err
-#SBATCH --exclusive
-
-# ==============================================================================
-# CONFIGURATION
-# ==============================================================================
-
-# Workspace directory for checkpoints and results
-WORKSPACE=${WORKSPACE:-/workspace}
-
-# Base directory for container image and mounts (set if not already set, e.g. by launch_nemo.sh)
-export WKDIR="${WKDIR:-}"
-
-# Model and training configurations
-MODEL_NAME=gpt_oss_20b
-RECIPE_NAME="${RECIPE_NAME:-${MODEL_NAME}_pretrain_config}"               # bf16 (default)
-# RECIPE_NAME="${MODEL_NAME}_pretrain_fp8_current_scaling_config"           # Hopper FP8 current scaling
-# RECIPE_NAME="${MODEL_NAME}_pretrain_mxfp8_config"                        # Blackwell MXFP8
-DATASET_NAME=dclm  # set to "mock" for mock data; "dclm" uses DCLM when DCLM_DATA_DIR/DCLM_CACHE are set below
-SEQ_LENGTH=4096
-
-# When DATASET_NAME=dclm, set DCLM_DATA_DIR and DCLM_CACHE so the recipe uses DCLM; leave unset for mock
-if [ "$DATASET_NAME" = "dclm" ]; then
-    # export DCLM_DATA_DIR="/path/to/dclm/preprocessed"
-    # export DCLM_CACHE="/path/to/cache"
-    :
-else
-    unset DCLM_DATA_DIR
-    unset DCLM_CACHE
-fi
-
-TRAIN_ITERS=1000
-GLOBAL_BATCH_SIZE=128
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR_WARMUP_ITERS=50
-LOG_INTERVAL=1
-WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
-
-# Parallelism configs: "TP,PP,EP,CP,SP" per entry (the total number of GPUs must be divisible by max(TP*CP, EP)*PP)
-PARALLELISM_CONFIGS=("2,4,4,1,True" "4,2,4,1,True" "2,4,4,2,True")
-
-# Container image (required)
-CONTAINER_IMAGE=""
-# CONTAINER_IMAGE="/path/to/container.sqsh"
-
-# Container mounts (optional; comma-separated for srun --container-mounts)
-CONTAINER_MOUNTS=""
-# CONTAINER_MOUNTS="/data:/data,/workspace:/workspace"
-
-# ==============================================================================
-# Environment Setup
-# ==============================================================================
-
-# NCCL optimizations for large-scale training
-export TORCH_NCCL_AVOID_RECORD_STREAMS=1
-export NCCL_NVLS_ENABLE=0
-
-# UV cache on shared filesystem (recommended for multi-node setups)
-# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
-# export UV_CACHE_DIR="/path/to/shared/uv_cache"
-
-# HuggingFace cache directory (recommended for shared filesystem)
-# export HF_HOME="/path/to/shared/HF_HOME"
-
-# Authentication tokens (set these for your environment)
-# export HF_TOKEN="hf_your_token_here"
-# export WANDB_API_KEY="your_wandb_key_here"
-
-# ==============================================================================
-# Job Execution
-# ==============================================================================
-
-echo "======================================"
-echo "GPT-OSS 20B Pretraining Job"
-echo "======================================"
-echo "Job ID: $SLURM_JOB_ID"
-echo "Nodes: $SLURM_JOB_NUM_NODES"
-echo "GPUs per node: $SLURM_GPUS_PER_NODE"
-echo "Model: $MODEL_NAME"
-echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}"
-echo "======================================"
-
-# Create logs directory if it doesn't exist
-mkdir -p logs
-
-# Require container image
-if [ -z "$CONTAINER_IMAGE" ]; then
-    echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image."
-    exit 1
-fi
-
-# Build srun command (shared across configs)
-SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
-if [ -n "$CONTAINER_MOUNTS" ]; then
-    SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS"
-fi
-echo "SRUN base: $SRUN_CMD"
-echo "======================================"
-
-# If using DCLM, pass dataset config via CLI overrides
-DCLM_DATASET_OVERRIDES=""
-if [ -n "${DCLM_DATA_DIR:-}" ] && [ -n "${DCLM_CACHE:-}" ]; then
-    BLEND_PATHS=""
-    for i in $(seq 1 10); do
-        pad=$(printf "%02d" $i)
-        PREFIX="${DCLM_DATA_DIR}/dclm_01_${pad}_text_document"
-        if [ -f "${PREFIX}.bin" ]; then
-            BLEND_PATHS="${BLEND_PATHS}\"${PREFIX}\","
-        fi
-    done
-    BLEND_PATHS="${BLEND_PATHS%,}"
-    
-    if [ -n "$BLEND_PATHS" ]; then
-        DCLM_DATASET_OVERRIDES="dataset.blend=[[${BLEND_PATHS}],null] dataset.split='\"9999,8,2\"' dataset.path_to_cache=${DCLM_CACHE}"
-    else
-        echo "WARNING: No DCLM data found in ${DCLM_DATA_DIR}!"
-    fi
-fi
-
-# Run each parallelism config in sequence
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-CONFIG_INDEX=0
-for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do
-    OLD_IFS=$IFS
-    IFS=',' read -r TP PP EP CP SP <<< "$CONFIG"
-    IFS=$OLD_IFS
-
-    CONFIG_INDEX=$((CONFIG_INDEX + 1))
-    
-    echo ""
-    echo "======================================"
-    echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP"
-    echo "======================================"
-
-    # Build CLI overrides for this config
-    CLI_OVERRIDES=" \
-        model.seq_length=$SEQ_LENGTH \
-        train.train_iters=$TRAIN_ITERS \
-        train.global_batch_size=$GLOBAL_BATCH_SIZE \
-        train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
-        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        logger.log_interval=$LOG_INTERVAL \
-        logger.wandb_project=$WANDB_PROJECT \
-        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        dataset.sequence_length=$SEQ_LENGTH \
-        model.tensor_model_parallel_size=$TP \
-        model.pipeline_model_parallel_size=$PP \
-        model.expert_model_parallel_size=$EP \
-        model.sequence_parallel=$SP \
-        model.context_parallel_size=$CP \
-    "
-    if [ -n "$DCLM_DATASET_OVERRIDES" ]; then
-        CLI_OVERRIDES="$CLI_OVERRIDES $DCLM_DATASET_OVERRIDES"
-    fi
-    CMD="uv run --no-sync python /opt/Megatron-Bridge/scripts/training/run_recipe.py"
-    CMD="$CMD --recipe ${RECIPE_NAME}"
-    CMD="$CMD $CLI_OVERRIDES"
-
-    echo "Executing command..."
-    echo "$CMD"
-    echo "======================================"
-
-    $SRUN_CMD bash -c "$CMD"
-    RUN_EXIT=$?
-    if [ $RUN_EXIT -ne 0 ]; then
-        echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT"
-        continue
-    fi
-done
-
-echo "======================================"
-echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)"
-echo "======================================"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/logging.md
-```md
-# Logging and Monitoring
-
-This guide describes how to configure logging in Megatron Bridge. It introduces the high-level `LoggerConfig`, explains experiment logging to TensorBoard and Weights & Biases (W&B), and documents console logging behavior.
-
-## LoggerConfig Overview
-
-{py:class}`~bridge.training.config.LoggerConfig` is the dataclass that encapsulates logging‑related settings for training. It resides inside the overall {py:class}`bridge.training.config.ConfigContainer`, which represents the complete configuration for a training run.
-
-### Timer Configuration Options
-
-Use the following options to control which timing metrics are collected during training and how they are aggregated and logged.
-
-#### `timing_log_level`
-Controls which timers are recorded during execution:
-
-- **Level 0**: Logs only the overall iteration time.
-- **Level 1**: Includes once-per-iteration operations, such as gradient all-reduce.
-- **Level 2**: Captures frequently executed operations, providing more detailed insights but with increased overhead.
-
-#### `timing_log_option`
-Specifies how timer values are aggregated across ranks. Valid options:
-
-- `"max"`: Logs the maximum value across ranks.
-- `"minmax"`: Logs both minimum and maximum values.
-- `"all"`: Logs all values from all ranks.
-
-#### `log_timers_to_tensorboard`
-When enabled, the framework records timer metrics to supported backends such as TensorBoard.
-
-
-### Diagnostic Options
-
-The framework provides several optional toggles for enhanced monitoring and diagnostics:
-
-- **Loss Scale**: Logs the current loss scale used in mixed-precision training.
-- **Validation Perplexity**: Tracks model perplexity during validation.
-- **CUDA Memory Statistics**: Reports detailed GPU memory usage.
-- **World Size**: Displays the total number of distributed ranks.
-
-### Logging Options
-
-Use the following options to enable additional diagnostics and performance monitoring during training.
-
-- **`log_params_norm`**: Computes and logs the L2 norm of model parameters. If available, it also logs the gradient norm.
-- **`log_energy`**: Activates the energy monitor, which records per-GPU energy consumption and instantaneous power usage.
-- **`log_memory`**: Logs the memory usage of the model from `torch.cuda.memory_stats()`.
-- **`log_throughput_to_tensorboard`**: Calculates the training throughput and utilization.
-- **`log_runtime_to_tensorboard`**: Estimates total time remaining until the end of the training.
-- **`log_l2_norm_grad_to_tensorboard`**: Computes and logs the L2 norm of gradients for each model layer.
-
-
-## Experiment Logging
-Both TensorBoard and W&B are supported for metric logging. When using W&B, it’s recommended to also enable TensorBoard to ensure that all scalar metrics are consistently logged across backends.
-
-### TensorBoard
-
- 
-#### What Gets Logged
-
-TensorBoard captures a range of training and system metrics, including:
-
-- **Learning rate**, including decoupled LR when applicable
-- **Per-loss scalars** for detailed breakdowns
-- **Batch size** and **loss scale**
-- **CUDA memory usage** and **world size** (if enabled)
-- **Validation loss**, with optional **perplexity**
-- **Timers**, when timing is enabled
-- **Energy consumption** and **instantaneous power**, if energy logging is active
-
-
-#### Enable TensorBoard Logging
-  1) Install TensorBoard (if not already available):
-  ```bash
-  pip install tensorboard
-  ```
-  2) **Configure logging** in your training setup. In these examples, `cfg` refers to a `ConfigContainer` instance (such as one produced by a recipe), which contains a `logger` attribute representing the `LoggerConfig`:
-  
-  ```python
-  from megatron.bridge.training.config import LoggerConfig
-
-  cfg.logger = LoggerConfig(
-      tensorboard_dir="./runs/tensorboard",
-      tensorboard_log_interval=10,
-      log_timers_to_tensorboard=True,   # optional
-      log_memory_to_tensorboard=False,  # optional
-  )
-  ```
-
-  ```{note}
-  The writer is created lazily on the last rank when `tensorboard_dir` is set.
-  ```
-
-#### Set the Output Directory
-
-TensorBoard event files are saved to the directory specified by `tensorboard_dir`.
-
-**Example with additional metrics enabled:**
-```python
-cfg.logger.tensorboard_dir = "./logs/tb"
-cfg.logger.tensorboard_log_interval = 5
-cfg.logger.log_loss_scale_to_tensorboard = True
-cfg.logger.log_validation_ppl_to_tensorboard = True
-cfg.logger.log_world_size_to_tensorboard = True
-cfg.logger.log_timers_to_tensorboard = True
-```
-
-### Weights & Biases (W&B)
-
-  
-#### What Gets Logged
-
-When enabled, W&B automatically mirrors the scalar metrics logged to TensorBoard.  
-In addition, the full run configuration is synced at initialization, allowing for reproducibility and experiment tracking.
-
-
-#### Enable W&B Logging
-
-  1) Install W&B (if not already available):
-  ```bash
-  pip install wandb
-  ```
-  2) Authenticate with W&B using one of the following methods:
-  - Set `WANDB_API_KEY` in the environment before the run, or
-  - Run `wandb login` once on the machine.
-  3) **Configure logging** in your training setup. In these examples, `cfg` refers to a `ConfigContainer` instance (such as one produced by a recipe), which contains a `logger` attribute representing the `LoggerConfig`:
-  
-  ```python
-  from megatron.bridge.training.config import LoggerConfig
-
-  cfg.logger = LoggerConfig(
-      tensorboard_dir="./runs/tensorboard",   # recommended: enables shared logging gate
-      wandb_project="my_project",
-      wandb_exp_name="my_experiment",
-      wandb_entity="my_team",                 # optional
-      wandb_save_dir="./runs/wandb",          # optional
-  )
-  ```
-  
-```{note}
-W&B is initialized lazily on the last rank when `wandb_project` is set and `wandb_exp_name` is non-empty.
-```  
-
-#### W&B Configuration with NeMo Run Launching
-
-For users launching training scripts with NeMo Run, W&B can be optionally configured using the {py:class}`bridge.recipes.run_plugins.WandbPlugin`.
-
-The plugin automatically forwards the `WANDB_API_KEY` and by default injects CLI overrides for the following logger parameters:
-
-- `logger.wandb_project`  
-- `logger.wandb_entity`  
-- `logger.wandb_exp_name`  
-- `logger.wandb_save_dir`
-
-This allows seamless integration of W&B logging into your training workflow without manual configuration.
-
-
-### MLFlow
-
-Megatron Bridge can log metrics and artifacts to MLFlow, following the same pattern as the W&B integration.
-
-#### What Gets Logged
-
-When enabled, MLFlow receives:
-
-- Training configuration as run parameters
-- Scalar metrics (losses, learning rate, batch size, throughput, timers, memory, runtime, norms, energy, etc.)
-- Checkpoint artifacts saved under an experiment-specific artifact path per iteration
-
-#### Enable MLFlow Logging
-
-  1) Install MLFlow (installed by default with Megatron Bridge):
-
-  ```bash
-  pip install mlflow  # or: uv add mlflow
-  ```
-
-  2) Configure the tracking server (Optional):
-  - Either set `MLFLOW_TRACKING_URI` in the environment, or
-  - Pass an explicit `mlflow_tracking_uri` in the logger config.
-
-  3) Configure logging in your training setup.
-
-  ```python
-  from megatron.bridge.training.config import LoggerConfig
-
-  cfg.logger = LoggerConfig(
-      tensorboard_dir="./runs/tensorboard",
-      mlflow_experiment="my_megatron_experiment",
-      mlflow_run_name="llama32_1b_pretrain_run",
-      mlflow_tracking_uri="http://mlflow:5000",  # optional
-      mlflow_tags={                              # optional
-          "project": "llama32",
-          "phase": "pretrain",
-      },
-  )
-  ```
-
-
-
-### Comet ML
-
-Megatron Bridge can log metrics and experiment metadata to Comet ML, following the same pattern as the W&B and MLFlow integrations.
-
-#### What Gets Logged
-
-When enabled, Comet ML receives:
-
-- Training configuration as experiment parameters
-- Scalar metrics (losses, learning rate, batch size, throughput, timers, memory, runtime, norms, energy, etc.)
-- Validation loss and perplexity metrics
-- Checkpoint save/load metadata
-
-#### Enable Comet ML Logging
-
-  1) Install Comet ML:
-
-  ```bash
-  pip install comet-ml
-  ```
-
-  2) Authenticate:
-  - Either set `COMET_API_KEY` in the environment, or
-  - Pass an explicit `comet_api_key` in the logger config.
-
-  3) Configure logging in your training setup.
-
-  ```python
-  from megatron.bridge.training.config import LoggerConfig
-
-  cfg.logger = LoggerConfig(
-      tensorboard_dir="./runs/tensorboard",
-      comet_project="my_project",
-      comet_experiment_name="llama32_1b_pretrain_run",
-      comet_workspace="my_workspace",          # optional
-      comet_tags=["pretrain", "llama32"],       # optional
-  )
-  ```
-
-```{note}
-Comet ML is initialized lazily on the last rank when `comet_project` is set and `comet_experiment_name` is non-empty.
-```
-
-#### Comet ML Configuration with NeMo Run Launching
-
-For users launching training scripts with NeMo Run, Comet ML can be optionally configured using the {py:class}`bridge.recipes.run_plugins.CometPlugin`.
-
-The plugin automatically forwards the `COMET_API_KEY` and by default injects CLI overrides for the following logger parameters:
-
-- `logger.comet_project`
-- `logger.comet_workspace`
-- `logger.comet_experiment_name`
-
-
-#### Progress Log
-
-When `logger.log_progress` is enabled, the framework generates a `progress.txt` file in the checkpoint save directory.
-
-This file includes:
-- **Job-level metadata**, such as timestamp and GPU count
-- **Periodic progress entries** throughout training
-
-At each checkpoint boundary, the log is updated with:
-- **Job throughput** (TFLOP/s/GPU)
-- **Cumulative throughput**
-- **Total floating-point operations**
-- **Tokens processed**
-
-This provides a lightweight, text-based audit trail of training progress, useful for tracking performance across restarts.
-
-
-## Tensor Inspection
-
-Megatron Bridge integrates with TransformerEngine's tensor inspection features via NVIDIA DLFW Inspect. This integration, controlled by {py:class}`~bridge.training.config.TensorInspectConfig`, enables advanced debugging and analysis of tensor statistics during training. When enabled, the framework handles initialization, step tracking, and cleanup automatically.
-
-```{note}
-**Current limitations:** Tensor inspection is currently supported only for linear modules in TransformerEngine (e.g., `fc1`, `fc2`, `layernorm_linear`). Operations like attention are not supported.
-```
-
-```{note}
-This section covers Megatron Bridge configuration. For comprehensive documentation on features, configuration syntax, and advanced usage, see:
-
-- [TransformerEngine Debug Documentation](https://github.com/NVIDIA/TransformerEngine/tree/af2a0c16ec11363c0af84690cd877a59f898820e/docs/debug)
-- [NVIDIA DLFW Inspect Documentation](https://github.com/NVIDIA/nvidia-dlfw-inspect/tree/4118044cc84f0183714a2ab1bc215fa49f9aaa82/docs)
-```
-
-### Installation
-
-Install NVIDIA DLFW Inspect if not already available:
-```bash
-pip install nvdlfw-inspect
-```
-
-### Available Features
-
-TransformerEngine provides the following debug features:
-
-- **LogTensorStats** – Logs high-precision tensor statistics: `min`, `max`, `mean`, `std`, `l1_norm`, `l2_norm`, `cur_amax`, `dynamic_range`.
-- **LogFp8TensorStats** – Logs quantized tensor statistics for FP8 recipes: `underflows%`, `scale_inv_min`, `scale_inv_max`, `mse`. Supports simulating alternative recipes (e.g., tracking `mxfp8_underflows%` during per-tensor current-scaling training)
-- **DisableFP8GEMM** – Runs specific GEMM operations in high precision
-- **DisableFP8Layer** – Disables FP8 for entire layers
-- **PerTensorScaling** – Enables per-tensor current scaling for specific tensors
-- **FakeQuant** – Experimental quantization testing
-
-See [TransformerEngine debug features](https://github.com/NVIDIA/TransformerEngine/tree/af2a0c16ec11363c0af84690cd877a59f898820e/transformer_engine/debug/features) for complete parameter lists and usage details.
-
-### Configuration
-
-Configure tensor inspection using {py:class}`~bridge.training.config.TensorInspectConfig` with either a YAML file or inline dictionary.
-
-#### YAML Configuration
-
-```yaml
-tensor_inspect:
-  enabled: true
-  features: ./conf/fp8_tensor_stats.yaml
-  log_dir: ./logs/tensor_inspect
-```
-
-**Example feature configuration file:**
-
-```yaml
-fp8_tensor_stats:
-  enabled: true
-  layers:
-    layer_name_regex_pattern: ".*(fc2)"
-  transformer_engine:
-    LogFp8TensorStats:
-      enabled: true
-      tensors: [weight,activation,gradient]
-      stats: ["underflows%", "mse"]
-      freq: 5
-      start_step: 0
-      end_step: 100
-```
-
-#### Python Configuration
-
-```python
-from bridge.training.config import TensorInspectConfig
-
-# Option 1: inline python dict
-cfg.tensor_inspect = TensorInspectConfig(
-    enabled=True,
-    features={
-        "fp8_gradient_stats": {
-            "enabled": True,
-            "layers": {"layer_name_regex_pattern": ".*(fc1|fc2)"},
-            "transformer_engine": {
-                "LogFp8TensorStats": {
-                    "enabled": True,
-                    "tensors": ["weight","activation","gradient"],
-                    "stats": ["underflows%", "mse"],
-                    "freq": 5,
-                    "start_step": 0,
-                    "end_step": 100,
-                },
-            },
-        }
-    },
-    log_dir="./logs/tensor_inspect",
-)
-
-# Option 2: reference external YAML
-cfg.tensor_inspect = TensorInspectConfig(
-    enabled=True,
-    features="./conf/fp8_inspect.yaml",
-    log_dir="./logs/tensor_inspect",
-)
-
-```
-
-#### Layer Selection
-
-Features apply to linear modules matched by selectors in the `layers` section:
-
-- `layer_name_regex_pattern: .*` – All supported linear layers
-- `layer_name_regex_pattern: .*layers\.(0|1|2).*(fc1|fc2|layernorm_linear)` – Linear modules in first three transformer layers
-- `layer_name_regex_pattern: .*(fc1|fc2)` – MLP projections only
-- `layer_types: [layernorm_linear, fc1]` – String matching (alternative to regex)
-
-Tensor-level selectors (`tensors`, `tensors_struct`) control which tensor roles are logged: `activation`, `gradient`, `weight`, `output`, `wgrad`, `dgrad`.
-
-### Output and Monitoring
-
-Tensor statistics are written to `tensor_inspect.log_dir` and forwarded to TensorBoard/W&B when enabled.
-
-**Log locations:**
-- Text logs: `<log_dir>/nvdlfw_inspect_statistics_logs/`
-- TensorBoard
-- W&B
-
-### Performance Considerations
-
-- Use `freq > 1` to reduce overhead. Statistics collection is expensive for large models.
-- Narrow layer selection with specific regex patterns rather than `.*`
-
-
-## Console Logging
-
-Megatron Bridge uses the standard Python logging subsystem for console output. 
-
-### Configure Console Logging
-
-To control console logging behavior, use the following configuration options:
-
-- `logging_level` sets the default verbosity level. It can be overridden via the `MEGATRON_BRIDGE_LOGGING_LEVEL` environment variable.
-- `filter_warnings` suppresses messages at the WARNING level.
-- `modules_to_filter` specifies logger name prefixes to exclude from output.
-- `set_level_for_all_loggers` determines whether the logging level is applied to all loggers or only a subset, depending on the current implementation.
-
-
-### Monitor Logging Cadence and Content
-
-To monitor training progress at regular intervals, the framework prints a summary line every `log_interval` iterations.
-
-Each summary includes:
-- **Timestamp**
-- **Iteration counters**
-- **Consumed and skipped samples**
-- **Iteration time (ms)**
-- **Learning rates**
-- **Global batch size**
-- **Per-loss averages**
-- **Loss scale**
-
-When enabled, additional metrics are printed:
-- **Gradient norm**
-- **Zeros in gradients**
-- **Parameter norm**
-- **Energy and power per GPU**
-
-Straggler timing reports follow the same `log_interval` cadence, helping identify performance bottlenecks across ranks.
-
-
-### Minimize Timing Overhead
-
-To reduce performance impact, set `timing_log_level` to `0`.  
-Increase to `1` or `2` only when more detailed timing metrics are required, as higher levels introduce additional logging overhead.
-
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/mixed-precision.md
-```md
-# Mixed Precision Training
-
-Mixed precision training significantly enhances computational efficiency by conducting operations in low-precision format, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. Megatron Bridge supports FP16, BF16, and FP8 via Transformer Engine (TE) across most models through the {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` configuration.
-
-## Configuration Overview
-
-Mixed precision is configured in Megatron Bridge through the `mixed_precision` field in {py:class}`bridge.training.config.ConfigContainer`, which accepts either:
-- A string name referencing a predefined recipe (e.g., `"bf16_mixed"`)  
-- A {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` object for custom configurations
-
-The mixed precision configuration automatically updates the model, optimizer, and distributed data parallel settings with the appropriate precision parameters.
-
-## Half-Precision Training
-
-Megatron Bridge supports half-precision FP16 and BF16 computation training via Megatron Core and the distributed optimizer. This training recipe uses half-precision in all layer computation while keeping the model states (optimizer states and master parameters) in single-precision. To avoid repeated data type casting at each layer computation, Megatron Core keeps a separate copy of half-precision parameters that is updated after each optimizer step.
-
-### Using Predefined Recipes
-
-The simplest way to enable mixed precision is using predefined recipe names:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure with BF16 mixed precision
-config = ConfigContainer(
-    mixed_precision="bf16_mixed",
-    # ... other config parameters
-)
-
-# Configure with FP16 mixed precision  
-config = ConfigContainer(
-    mixed_precision="fp16_mixed",
-    # ... other config parameters
-)
-```
-
-### Custom Mixed Precision Configuration
-
-For more control, create a custom {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig`:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
-import torch
-
-# Custom BF16 configuration
-bf16_config = MixedPrecisionConfig(
-    bf16=True,
-    params_dtype=torch.bfloat16,
-    pipeline_dtype=torch.bfloat16,
-    autocast_enabled=False,
-    grad_reduce_in_fp32=True,
-)
-
-config = ConfigContainer(
-    mixed_precision=bf16_config,
-    # ... other config parameters
-)
-```
-
-## FP8 Training
-
-The NVIDIA H100 GPU introduced support for a new datatype, FP8 (8-bit floating point), enabling higher throughput of matrix multiplies and convolutions. Megatron Bridge uses the NVIDIA TransformerEngine (TE) to leverage speedups from FP8. For a more detailed overview, refer to the [TE documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html), specifically the FP8 format and recipe.
-
-### FP8 Configuration Parameters
-
-The {py:class}`bridge.training.mixed_precision.MixedPrecisionConfig` provides several FP8-specific parameters:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `fp8` | `Optional[str]` | `None` | FP8 format: `"hybrid"` (E4M3 for activations/weights, E5M2 for gradients) or `"e4m3"` |
-| `fp8_recipe` | `str` | `"tensorwise"` | FP8 recipe type: `"tensorwise"`, `"delayed"`, `"blockwise"`, `"mxfp8"` (Blackwell only) |
-| `first_last_layers_bf16` | `bool` | `False` | If True, retains first and last N TransformerBlocks in BF16 as opposed to FP8 |
-| `num_layers_at_start_in_bf16` | `int` | `0` | Number of layers at the start of the model to keep in BF16 precision when `first_last_layers_bf16` is True |
-| `num_layers_at_end_in_bf16` | `int` | `0` | Number of layers at the end of the model to keep in BF16 precision when `first_last_layers_bf16` is True |
-| `fp8_margin` | `int` | `0` | Scaling factor shift by $2^{margin}$ |
-| `fp8_amax_history_len` | `int` | `1` | Window size for amax history storage |
-| `fp8_amax_compute_algo` | `str` | `"most_recent"` | Amax selection algorithm: `"max"` or `"most_recent"` |
-| `fp8_param` | `Optional[bool]` | `None` | Store module-level parameters in FP8 |
-| `fp8_param_gather` | `bool` | `False` | Enable FP8 parameter gathering |
-
-### FP8 Recipe Examples
-
-Use any of the predefined FP8 recipe names with the `mixed_precision` parameter:
-
-```python
-# Example: BF16 with FP8 current scaling
-config = ConfigContainer(
-    mixed_precision="bf16_with_fp8_current_scaling_mixed",
-    # ... other config parameters
-)
-```
-
-## Available Mixed Precision Recipes
-
-Megatron Bridge provides numerous predefined mixed precision recipes for different use cases. You can use the {py:func}`~megatron.bridge.training.mixed_precision.get_mixed_precision_config` utility function to convert from a string shortname to a class instance. For the complete list of available recipes and their specific configurations, see the {py:mod}`megatron.bridge.training.mixed_precision` module.
-
-
-### Custom FP8 Configuration
-
-For advanced use cases, create a custom FP8 configuration:
-
-```python
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig
-import torch
-
-# Custom FP8 configuration
-fp8_config = MixedPrecisionConfig(
-    bf16=True,
-    params_dtype=torch.bfloat16,
-    pipeline_dtype=torch.bfloat16,
-    fp8="hybrid",
-    fp8_recipe="tensorwise", 
-    fp8_margin=0,
-    fp8_amax_history_len=1024,
-    fp8_amax_compute_algo="max",
-    fp8_param_gather=True,
-)
-
-config = ConfigContainer(
-    mixed_precision=fp8_config,
-    # ... other config parameters
-)
-```
-
-### Registering Custom Mixed Precision Recipes
-
-You can also register your own custom mixed precision configurations to work with the shortname system. Use the {py:func}`~megatron.bridge.training.mixed_precision.register` decorator on a function that returns a `MixedPrecisionConfig` object:
-
-```python
-from megatron.bridge.training.mixed_precision import register, MixedPrecisionConfig, get_mixed_precision_config
-
-@register
-def my_custom_fp8_recipe() -> MixedPrecisionConfig:
-    """Custom FP8 recipe with specific settings for my use case."""
-    return MixedPrecisionConfig(
-        bf16=True,
-        fp8="hybrid",
-        fp8_recipe="tensorwise",
-        fp8_param_gather=True,
-        # ... other custom settings
-    )
-
-# Now you can use it with the utility function
-config = get_mixed_precision_config("my_custom_fp8_recipe")
-```
-
-Common recipe categories include:
-- **Half-precision recipes**: Basic BF16 and FP16 mixed precision
-- **FP8 recipes**: Various FP8 scaling strategies (delayed, current, subchannel)
-- **Architecture-specific recipes**: Optimized for specific GPU architectures (Hopper, Blackwell)
-- **Model-specific recipes**: Tuned for particular model families
-
-## Configuration Synchronization
-
-When a mixed precision configuration is provided, it automatically synchronizes precision-related settings across the model, optimizer, and distributed data parallel (DDP) configurations. This ensures consistent precision behavior throughout the training pipeline.
-
-**Important**: Mixed precision settings will override any conflicting precision parameters that may have been set directly on the model, optimizer, or DDP configurations. The mixed precision configuration acts as the authoritative source for all precision-related parameters.
-
-For example, if you specify both:
-```python
-# This will be overridden
-model_config.bf16 = False
-optimizer_config.bf16 = False
-
-config = ConfigContainer(
-    model=model_config,
-    optimizer=optimizer_config,
-    mixed_precision="bf16_mixed",  # This takes precedence during training
-    # ... other configs
-)
-```
-
-The mixed precision configuration will set `bf16=True` on both the model and optimizer configs, overriding the explicitly set `False` values. This synchronization prevents configuration mismatches that could lead to training issues.
-
-## Performance Considerations
-
-- **FP8 recipes are experimental** and convergence has not been fully validated for all models
-- **BF16** is generally recommended over FP16 for better numerical stability
-- **FP8** provides the best performance on H100 GPUs but requires careful tuning
-- **MXFP8** recipes are only supported on Blackwell architecture GPUs
-- **Blockwise scaling** recipes are optimized for Hopper architecture GPUs
-
-## Resources
-
-- [Transformer Engine Documentation](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html)
-- [Intro to FP8, floating point formats, and mixed precision training](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html#Introduction-to-FP8)
-- [Performance optimizations](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/advanced_optimizations.html) that are natively supported in Megatron Bridge by enabling FP8 training with TE
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/performance-guide.md
-```md
-# Performance Tuning Guide
-
-Megatron-Bridge provides a wide range of features for performant and memory-efficient LLM training on GPUs, and comes pre-configured with optimal settings. However, factors such as model architecture, hyperparameters, GPU count, and GPU type can affect the available options, and additional tuning may be necessary to achieve optimal performance. This document explores the factors that affect training performance, highlights common issues, and outlines techniques for performance tuning that lead to higher MFU (Model FLOPS Utilization) and lower TCO (total cost of ownership).
-
-```{Note}
-This guide makes references to several configuration settings. These settings will be referenced relative to the config class that contains them, e.g. `OptimizerConfig.lr`. Please see <project:apidocs/index.rst> for more details on configuration settings.
-```
-
-```{Note}
-This guide references several configuration settings from `TransformerConfig`. Please apply these to the appropriate ModelProvider for your model, e.g. `GPTModelProvider`, as the `ConfigContainer` does not accept a raw `TransformerConfig`.
-```
-
-## Low Precision Training
-
-1. Expected speedup of FP8 training compared to BF16 training
-
-   > 1. The default low-precision LLM training recipe applies FP8 computation exclusively to the linear layers within the Transformer block, typically achieving a speedup of 1.2–1.5X.
-   > 2. However, the actual speedup depends on the proportion of training time spent on these linear layers. For instance, smaller LLMs with a limited hidden size exhibit lower FP8 speedup, as linear layers scale with O(sequence_length × hidden_size²) complexity, whereas the other element-wise computation layers (e.g., layer norms, dropouts, RoPE, and simple math functions) scale with O(sequence_length × hidden_size), and dot-product attention scales with O(sequence_length² × hidden_size). Consequently, the contribution of linear layers to the overall training time is smaller in such models.
-   > 3. Different FP8 recipes use varying quantization block sizes, affecting performance. Smaller quantization blocks generally incur higher overhead in both quantization and GEMM execution. For example, MXFP8 with a 1×32 quantization block performs less efficiently than full tensor-wise FP8 scaling.
-
-2. Common issues of low FP8 training speedup
-
-   > 1. Training becomes host-performance bound when the LLM uses small GPU kernels (see [Lowering Host Overhead and Jitters](#lowering-overhead-jitter)).
-   > 2. A low proportion of linear layers in training step time that use FP8 computation.
-
-## Parallel Mapping Strategies
-
-1. Data Parallelism using Distributed Optimizer
-
-   > 1. You should begin with data-parallel (DP) mapping. As long as the model and activation memory fit within the GPUs, data parallelism generally offers optimal performance, minimizes communication overhead, and maximizes per-GPU tensor sizes (compared to per-tensor sharding).
-   >
-   > 2. Megatron-Bridge uses the distributed optimizer as the default method for data-parallel training. It shards master parameters and optimizer states across data-parallel ranks, reducing model state memory usage without increasing communication overhead compared to traditional data-parallel training.
-   >
-   >    > 1. `OptimizerConfig.use_distributed_optimizer=true`
-
-2. Per-tensor Sharding (Tensor-parallel or Context-parallel mappings)
-
-   > 1. Tensor parallelism (TP) is the primary recommendation when a model exceeds GPU memory capacity under data-parallel mapping. However, since it involves higher communication overhead, the tensor-parallel size should ideally be confined to the high-bandwidth intra-node network (NVLink domain).
-   >
-   >    > 1. `TransformerConfig.tensor_model_parallel_size=<int>`
-   >
-   > 2. When the sequence length in a training run is significantly larger than the hidden size, activation memory can overflow. In such cases, context parallelism (CP) helps by sharding tensors along the sequence dimension, allowing the workload to fit within limited GPU memory and improving performance. Like tensor parallelism (TP), CP requires inter-GPU communication of activations. However, for the same tensor sizes, CP generally results in lower communication volume.
-   >
-   >    That said, CP’s effectiveness depends on the relative sizes of the sequence length and hidden size. When the sequence length is smaller than the hidden size, CP produces narrow (or "skinny") tensor shards on each GPU. This reduces data reuse and can degrade performance.
-   >
-   >    Additionally, because CP shards activations, it also partitions optimizer states in distributed training. As a result, optimizer state partitioning spans both the data parallel (DP) and context parallel (CP) dimensions.
-   >
-   >    > 1. `TransformerConfig.context_parallel_size=<int>`
-   >
-   > 3. Performance tips:
-   >
-   >    > 1. A large tensor-parallel or context-parallel size is not recommended unless the hidden size or sequence length is large enough to maintain sufficient per-GPU parallelism and avoid excessive communication overhead. For example, using a tensor-parallel size of 8 for LLAMA 3 70B could lead to low GPU utilization and make training host-performance bound.
-   >    > 2. You can combine TP and CP to optimize performance by balancing communication overhead. For example, using TP=2 along with CP=2 can give better performance than TP=4 when the sequence size is larger than the hidden size.
-   >    > 3. For additional tips, see [Long Sequence Training](#long-sequence-train).
-
-1. Pipeline Parallelism
-
-   > 1. Pipeline parallelism (PP) is necessary when a model cannot fit within GPU memory using tensor parallelism. Also, virtual pipeline parallelism (VPP) should be used in conjunction with pipeline parallelism to reduce the overhead caused by pipeline warm-up and flush bubbles.
-   >
-   >    > 1. `TransformerConfig.pipeline_model_parallel_size=<int>`
-   >    > 2. `TransformerConfig.virtual_pipeline_model_parallel_size=<int>`
-   >
-   > 2. Performance tips in PP and VPP sizing:
-   >
-   >    > 1. PP can also be combined with per-tensor sharding methods to mitigate the impact of sharding inefficiencies and pipeline bubbles. For instance, TP4 + PP2 may outperform TP8 when both mappings fit into memory because using a large TP reduces per-GPU tensor sizes but increases the communication cost, increasing the exposed communication.
-   >    > 2. VPP increases inter-stage communication overhead. When a global batch contains many micro-batches, using a smaller VPP size can improve performance, as the exposed communication cost outweighs the reduction in pipeline bubbles.
-   >
-   > 3. Asymmetric Transformer layer allocation across pipeline stages
-   >
-   >    > 1. An LLM with a large vocabulary size has computationally heavy embedding lookup and projection operations, leading to load imbalance across pipeline stages. To address this, Megatron-Bridge provides an option to allocate one fewer Transformer layer in the first and last pipeline stages, which handle embedding lookup and projection, to better balance workloads.
-   >    >
-   >    >    > 1. `GPTProvider.account_for_embedding_in_pipeline_split=true`
-   >    >    > 2. `GPTProvider.account_for_loss_in_pipeline_split=true`
-
-2. Expert Parallelism
-
-   > 1. Expert Parallelism (EP) is designed specifically for Mixture-of-Experts (MoE) models to efficiently distribute sparse MLP weights across multiple chips. It can be used in combination with other parallelism strategies such as Tensor Parallelism (TP), Context Parallelism (CP), Pipeline Parallelism (PP), Data Parallelism (DP), and Fully Sharded Data Parallel (FSDP). In the current design, the dense attention part and the sparse MLP part are fully decoupled in terms of their TP, CP, and DP parallelism configurations. Expert Tensor Parallelism (ETP) is introduced to specifically control the tensor parallelism for the sparse MLP part. ETP uses TP for dense layers for the ranks allocated for EP in sparse layers. On the other hand, the baseline is DEP, which folds DP in dense layers for EP in sparse layers.
-   >
-   >    > 1. `TransformerConfig.expert_model_parallel_size=<int>`
-   >    > 2. `TransformerConfig.expert_tensor_parallel_size=<int>`
-   >
-   > 2. Performance tips in hybrid folding options and EP sizing:
-   >
-   >    > 1. Typically, EP is kept within the high-bandwidth intra-node network (NVLink domain) to minimize the communication overhead it can introduce. However, using communication overlap techniques—such as pipeline overlap or 1F1B overlap—along with PP (e.g., DualPipe) might make it possible to expand EP into the inter-node networks.
-   >    >
-   >    > 2. Within the sparse MLP block, DP replaces CP because it has no impact on the computation pattern based on the dispatched tokens in each EP rank.
-   >    >
-   >    > 3. Usually, ETP is set to 1 to avoid significant communication overhead that comes with applying TP to MLP GEMMs.
-   >    >
-   >    > 4. When multiple experts are placed on a single chip after applying Expert Parallelism, enabling grouped GEMM can significantly improve computation efficiency.
-   >    >
-   >    >    > 1. `TransformerConfig.moe_grouped_gemm=True`
-
-3. Fully Sharded Data Parallelism
-
-   > 1. Megatron-Bridge supports PyTorch-native FSDP. FSDP can be used in combination with per-tensor sharding methods.
-   >
-   >    > 1. To use PyTorch FSDP2:
-   >    >
-   >    >    > 1. `DistributedInitConfig.use_torch_fsdp2=True`
-   >
-   > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios:
-   >
-   >    > 1. Small models with long sequences, where the parameter AllGather and gradient ReduceScatter can be effectively hidden under computation and the short communication causes only minor interference with the overlapped computation.
-   >    > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect.
-   >    > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size.
-
-   <!-- TODO: support megatron custom fsdp -->
-   <!-- > 1. Megatron-Bridge supports two Fully Sharded Data Parallelism (FSDP) implementations: PyTorch-native FSDP and a custom Megatron FSDP built within Megatron Core. While both follow the same sharding principles, the custom implementation is further optimized for performance. The performance gain of the custom FSDP comes primarily from minimizing the data movement to the communication tensors and reusing communication buffers. Both FSDP methods can be used in combination with per-tensor sharding methods. -->
-   <!-- > -->
-   <!-- >    > 1. To use PyTorch FSDP2: -->
-   <!-- >    > -->
-   <!-- >    >    > 1. `DistributedInitConfig.use_torch_fsdp2=True` -->
-   <!-- >    > -->
-   <!-- >    > 2. To use Custom Megatron FSDP: -->
-   <!-- >    > -->
-   <!-- >    >    > 1. `recipe.trainer.strategy.fsdp="megatron"` -->
-   <!-- >    >    > 2. `recipe.trainer.strategy.ddp.data_parallel_sharding_strategy="optim_grads_params"` -->
-   <!-- > -->
-   <!-- > 2. FSDP can be preferred over TP+PP+DP mappings in the following scenarios: -->
-   <!-- > -->
-   <!-- >    > 1. Small models with a large sequence, thus the parameter AllGather and gradient ReduceScatter can effectively be hidden under computation and the short communication overlap causes minor interference to the computation under overlap. -->
-   <!-- >    > 2. In FSDP training, activation storage remains as the main memory bottleneck because FSDP only shards model state memory, and a large per-GPU activation is needed to hide the costly FSDP communication. On GB200 GPUs, Megatron-Bridge offers an option to offload activations to the host memory via a high-speed chip-to-chip interconnect. -->
-   <!-- >    > 3. Baseline training is host performance-bound, but FSDP allows for larger per-GPU tensor sizes by eliminating TP or enabling a larger micro-batch size. -->
-
-4. Heterogeneous Encoder Parallelism
-
-   > 1. Encoder Pipeline Parallel
-   >
-   >    > 1. Use `T5ModelProvider.encoder_pipeline_model_parallel_size`.
-   >    > 2. In an Encoder-Decoder architecture like Multimodal models (VLMs like NeVA etc.), Encoder Pipeline Parallel can be used to add pipeline parallelism to the encoder.
-   >    > 3. Pipeline parallelism controls the amount of pipelining in the decoder part.
-   >    > 4. Encoder Pipeline Parallel is limited to 1 at the moment, i.e., the encoder can occupy a maximum of 1 PP stage.
-   >    > 5. By default, Encoder Pipeline Parallel is 0 and Decoder Pipeline Parallel is 1.
-   >    > 6. When the Encoder Pipeline Parallel size is 0, it shares the first PP stage of the Decoder.
-   >
-   > 2. Encoder Tensor Parallel
-   >
-   >    > 1. Use `T5ModelProvider.encoder_tensor_model_parallel_size`.
-   >    > 2. Since encoders tend to be much smaller than decoders, we also provide the ability to set a different amount of tensor parallelism to the encoder than the decoder.
-   >    > 3. By default, encoder tensor parallel is set to 0, i.e., the amount of tensor parallelism in the encoder is equal to tensor parallelism in the decoder.
-   >    > 4. To use this option, Encoder Pipeline Parallel must be greater than 0 as we need the encoder to be on its own pipeline stage.
-   >    > 5. Encoder Tensor Parallel size is limited to be less than or equal to Tensor parallel size.
-   >
-   > 3. Total number of GPUs required when these features are used is:
-   >
-   >    > 1. Data Parallel size * Context Parallel size * ((Encoder TP * Encoder PP) + (Decoder TP * Decoder PP))
-   >
-   > 4. These features are experimental and may still have bugs. There are critical bug fixes that will be made in a future release.
-
-5. Parallel mapping strategies with NVL72
-
-   > 1. Training with only data parallelism or FSDP makes it straightforward to fully utilize the bandwidth of an NVL72 system. However, when combining multiple parallelism strategies, it's important to ensure that high-volume communicators remain confined within each NVL72 domain. For example, with TP=4, DP=16, and PP=4, the GPUs in the first TP group of DP1/PP1 span both NVLink and network domains, causing communication performance to be bottlenecked by the slower network link. To avoid this, you may choose TP and DP sizes such that the product of TP × DP divides evenly into the NVL72 configuration. If the model-parallel size does not align naturally, padding may be required to support non-divisible group sizes.
-   > 2. To avoid this partitioning complexity, you can just use 64 GPUs out of the 72 GPUs.
-
-## Communication Overlaps and Tuning
-
-1. Data-parallel communication of Distributed Optimizer
-
-   > 1. Distributed optimizer overlaps parameter AllGathers with the forward computation of the first micro-batch and gradient ReduceScatters with the backward computation of the last micro-batch.
-   >
-   >    > 1. `DistributedDataParallelConfig.overlap_param_gather=true`
-   >    > 2. `DistributedDataParallelConfig.overlap_grad_reduce=true`
-   >
-   > 2. When using the distributed optimizer with pipeline parallelism (PP) + virtual pipeline parallelism (VPP), DP communications overlap with multiple micro-batches, increasing the opportunity for effective overlap. Also, Megatron-Bridge aligns the execution timing of DP communications across pipeline-parallel ranks to synchronize the computing kernel slowdown from the overlap.
-   >
-   >    > 1. `DistributedDataParallelConfig.align_param_gather=true`
-   >
-   > 3. Slow DP communication in large-scale training:
-   >
-   >    > 1. Distributing optimizer states across a partial DP domain reduces communication costs over high-latency Ethernet networks. Model states remain replicated outside the distributed domain. During the final micro-batch backpropagation, gradient ReduceScatters occur within the distributed domain, followed by AllReduce in the non-distributed domain. Parameter AllGathers are performed only within the distributed domain.
-   >    >
-   >    >    > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances= <int>`
-   >    >
-   >    > 2. A large message size for DP communication is recommended to maximize network bandwidth utilization. You can achieve this by increasing the communication bucket size.
-   >    >
-   >    >    > 1. `DistributedDataParallelConfig.bucket_size=<number_of_elements: int>`
-   >
-   > 4. A common reason for DP communication overlap failure:
-   >
-   >    > 1. Persistent Layer Normalization (LN) kernels from Transformer Engine use spin-waiting for all SMs in the GPU, causing the LN kernel and subsequent computation kernels to be scheduled only after DP communication. To prevent this, an appropriate SM margin should be configured using the following environment variables.
-   >    >
-   >    >    > 1. `NVTE_FWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>`
-   >    >    > 2. `NVTE_BWD_LAYERNORM_SM_MARGIN=<#SM for DP collectives = 16>`
-
-<!-- 2. Custom Megatron FSDP -->
-
-<!--    > 1. Unless you specify the communication bucket size, MCORE FSDP uses fixed communication overlap that overlaps the parameter AllGather and gradient ReduceScatter of each Transformer layer with its associated forward and backward computations. -->
-
-3. Tensor-parallel (TP) communication (with sequence parallelism)
-
-   > 1. Megatron-Bridge currently uses the userbuffer backend in Transformer Engine for TP communication overlaps. This offers the pipelined overlap of the TP communication with dependent computation.
-   >
-   >    > 1. `CommOverlapConfig.tp_comm_overlap`
-   >
-   > 2. The overlap method, resource, and precision of the TP communication overlaps are configurable, and the most performant configurations are set in the Megatron-Bridge training recipes by default. Also, you can set a custom TP communication overlap configuration via the below interface following the structure of TransformerLayerTPOverlapCfg class.
-   >
-   >    > 1. `CommOverlapConfig.tp_comm_overlap_cfg=<TransformerLayerTPOverlapCfg>`
-   >
-   > 3. TP communication overlap setting tips
-   >
-   >    > 1. Balancing the number of SMs between communication and GEMM
-   >    >
-   >    >    > 1. For AllGather/ReduceScatter bulk and ReduceScatter pipelined overlap, you can adjust the number of SMs to balance communication and GEMM execution. Allocating too many SMs to communication may degrade GEMM performance, while too few may expose communication overhead. The default SM allocation for communication is 16, but you can fine-tune it based on profiling results.
-   >    >    > 2. `TPOverlapCfg.num_sm=<int>`
-   >    >
-   >    > 2. CGA sizing to improve SM utilization
-   >    >
-   >    >    > 1. The CGA size can be set between 1 and 4, but it should not exceed the number of SMs allocated for communication. We recommend using CGA ≤ 2 to prevent potential SM rasterization that could impact GEMM performance.
-   >    >    > 2. `TPOverlapCfg.cga_size=<int≤4>`
-   >    >
-   >    > 3. Use 4× splits for ReduceScatter and GEMM overlap to optimize the balance between GEMM efficiency and communication exposure.
-   >    >
-   >    >    > 1. In GEMM-then-ReduceScatter pipeline overlap, a 1× ReduceScatter chunk remains exposed. A small split size increases communication exposure, while a large split size may degrade performance due to aggregated GEMM wave quantization. We find that num_splits = 4 generally provides the best performance.
-   >    >    > 2. `TPOverlapCfg.num_split=<int>`
-   >
-   > 4. Common reason for TP comm overlap failure at Hopper
-   >
-   >    > 1. On H100 GPUs, the environment variable `CUDA_DEVICE_MAX_CONNECTIONS=1` should be set. Otherwise, the TP communication kernels may be scheduled only after the GEMM they are meant to overlap with has finished.
-   >    > 2. Pipelined TP communication overlap is used by a static userbuffer registered upon model initialization. Therefore, it doesn't support activation tensors dynamically changing between steps or between Transformer layers.
-
-4. Context-parallel (CP) communication
-
-   > 1. CP communication is configurable via "cp_comm_type", which can be "p2p", "all_gather", "a2a", or "a2a+p2p". Communications of "p2p" are implemented as ring-exchange send/receive operations, and they are hard-coded to overlap with the attention compute of sequence chunks. See [Long Sequence Training](#long-sequence-train) for more details.
-
-5. Expert-parallel communication
-
-   > 1. To hide the A2A/AG communication introduced by EP, pipeline split overlap or 1F1B overlap alongside Pipeline Parallelism could be possible. It will be added to Megatron-Bridge in future releases.
-
-6. Pipeline-parallel (PP) send/receive communication
-
-   > 1. PP send/recv in steady 1F1B states are set to be overlapped with computes by default.
-   > 2. The PP send/recv in warmup and flush are exposed by default.
-
-(comm-data-types)=
-## Communication Data Types
-
-1. FP8 data-parallel parameter AllGather in Distributed Optimizer and FSDP
-
-   > 1. Megatron-Bridge supports FP8 parameter AllGather for per-tensor FP8 scaling recipes. This operation is lossless, enhancing performance while reducing memory usage.
-   >
-   >    > 1. `MixedPrecisionConfig.fp8_param=true`
-
-2. BF16 (instead of FP32) data-parallel reduction in Distributed Optimizer and FSDP
-
-   > 1. We have validated that BF16 reduction is numerically safe across numerous model training runs. However, BF16 reduction with a large data-parallel size (e.g., DP ≥ 128), especially the Ring reduction algorithm—which accumulates copies sequentially—may impact numerical stability. When using SHARP with NVIDIA InfiniBand, BF16 reduction is more robust, as it performs binary additions with higher precision for intermediate partial reductions.
-   >
-   >    > 1. `DistributedDataParallelConfig.grad_reduce_in_fp32=false`
-
-3. FP8 tensor-parallel ReduceScatter
-
-   > 1. When communication latency exceeds GEMM execution time, using FP8 input ReduceScatter can better hide communication overhead. This approach has low numerical impact, as the GEMM output must be cast to FP8 and then converted back to high precision during reduction.
-   >
-   >    > 1. `TPOverlapCfg.fp8_buf=true`
-
-4. FP8 A2A Dispatch for expert parallel communication
-
-   > 1. Megatron-Bridge is working on supporting FP8 A2A dispatch (before expert FC1), but still keeps BF16 A2A combine (after expert FC2).
-
-## Performance at Scale
-
-1. Scaling a training job is typically achieved by increasing the size of the data-parallel domain. In large-scale training, this often results in a small number of micro-batches per global batch—or even a single micro-batch—causing most computations to overlap with data-parallel communication. To maintain high performance in such scenarios, you should focus on minimizing the overhead of data-parallel communication and reducing host-driven inter-GPU jitter.
-
-2. You can lower the overhead of data-parallel communication by (1) reducing the communication precision e.g., BF16 for gradient reduction and FP8 parameter gathering, (2) improving the efficiency of communication by increasing the data-parallel communication message size or using the hierarchical data-parallel reduction, or (3) using multi-cast and switch reduction with SHARP in case of InfiniBand network.
-
-   > 1. Using BF16 gradient reduction and FP8 parameter gather are described in [Communication Data Types](#comm-data-types)
-   >
-   > 2. For non-pipeline-parallel training, the data-parallel communication bucket size can be adjusted using the knobs below. In pipeline-parallel training, however, the bucket size is fixed and determined by the number of parameters assigned to each virtual pipeline rank.
-   >
-   >    > 1. `DistributedDataParallelConfig.bucket_size=<int: number_of_elements>`
-   >
-   > 3. Setting the knob below splits the data-parallel domain of the distributed optimizer into a sharding domain and a replication domain. Gradient reduction then occurs in two stages—one within each domain—avoiding the use of a single large flat ring for collective operations that have high latency.
-   >
-   >    > 1. `DistributedDataParallelConfig.num_distributed_optimizer_instances=<int: ≤dp_size>`
-
-3. Ideas to reduce the host-driven inter-GPU jitters are discussed in [Lowering Host Overhead and Jitters](#lowering-overhead-jitter).
-
-(lowering-overhead-jitter)=
-## Lowering Host Overhead and Jitters
-
-1. Common observation associated with host overhead
-
-   > 1. Significantly low GPU FLOPS.
-   > 2. Small performance gain of low-precision (FP8) training.
-   > 3. Small LLMs with small hidden size or sequence length or fine-tuning without sequence packing
-   > 4. High multi-GPU communication variation.
-
-2. Increase micro-batch size and reduce per-tensor sharding
-
-   > 1. The most common way to increase per-GPU tensor size is by increasing the micro-batch size or minimizing unnecessary per-tensor sharding (e.g., TP or CP) when GPU memory permits.
-
-3. Manual garbage collection to align the host interruption across GPUs
-
-   > 1. Megatron-Bridge manually aligns the timing of garbage collection across GPUs, which significantly mitigates the host overhead compared to the baseline automatic garbage collection.
-   >
-   >    > 1. `TrainingConfig.manual_gc_interval=<int>`
-
-4. CUDA graph to eliminate repeated static host code execution
-
-   > 1. Megatron-Bridge supports graph capture, significantly reducing host overhead. CUDA Graph is applicable only to LLMs with a static tensor shape across training steps. For example, it supports fixed-size packed sequences but does not handle sequences with varying lengths at each step. Also, MoE models with token-dropless propagation have limited CUDA graph support, restricted to the dense modules only.
-   > 2. CUDA graph requires additional memory for static buffer management, typically adding a few gigabytes for static buffers, while models with PP size > 1 may consume over 10GB. We are actively working to reduce this memory overhead.
-   > 3. See [CUDA Graphs](training/cuda-graphs.md) for configuration details (`cuda_graph_impl`, `cuda_graph_scope`).
-
-5. Bind CPU memory for GPU processes
-
-   > 1. Binding CPU cores to GPU processes helps mitigate long latency issues and ensures minimal variation in GPU queuing latency across GPUs. This optimization has a significant impact, particularly when the communication domain size is large.
-   > 2. Example command line for an x86-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/4)) --membind=$((SLURM_LOCALID/4)) <run script>`
-   > 3. Example command line for a Grace-based GPU system: `numactl --cpunodebind=$((SLURM_LOCALID/2)) --membind=$((SLURM_LOCALID/2)) <run script>`
-
-(reducing-memory-overflow)=
-## Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency
-
-1. Activation recomputation
-
-   > 1. Megatron-Bridge LLMs default to dot-product attention-only recomputation using Flash Attention, efficiently regenerating large intermediate activations from the attention operation with minimal computational overhead.
-   >
-   > 2. Megatron-Bridge also supports recomputing the full intermediate activations of a Transformer block, significantly reducing activation memory usage at the cost of approximately 30% additional computation. The number of Transformer blocks to recompute can be adjusted using a configurable setting.
-   >
-   >    > 1. `TransformerConfig.recompute_granularity=full`
-   >    > 2. `TransformerConfig.recompute_method=block`
-   >    > 3. `TransformerConfig.recompute_num_layers=<int:≤num_layers_in_the_model>`
-
-2. Activation offloading to host memory
-
-   > 1. Megatron-Bridge supports offloading activation memory to host memory, essential for training tasks constrained by activation memory. This is particularly useful for scenarios like (1) FSDP, where model state memory is minimized through sharding but activation memory remains high, (2) LoRA, which has frozen parameters but significant activation memory demands, and (3) the training with a large sequence length. The efficiency of activation offloading depends on both the interconnect bandwidth between the GPU and host and the host memory bandwidth. From this perspective, Grace-based systems like the GB200 enhance offloading performance by optimizing these bandwidths.
-   >
-   > 2. The following knobs should be configured to enable offloading and specify the number of Transformer layers to offload to host memory. The maximum number of layers that can be offloaded depends on host memory capacity, which may be lower when the CPU is shared among multiple GPUs.
-   >
-   >    > 1. `TransformerConfig.cpu_offloading=True`
-   >    > 2. `TransformerConfig.cpu_offloading_weights=False`
-   >    > 3. `TransformerConfig.cpu_offloading_num_layers= <int:≤activation_offload_layers>`
-   >
-   > 3. Environment variable settings to avoid resource conflict between CPU memory offloading and network communication
-   >
-   >    > 1. `NCCL_NET_GDR_LEVEL=PHB # NCCL <=2.25`
-   >    > 2. `NCCL_NET_GDR_C2C=1     # NCCL >=2.26`
-   >
-   > 4. Optimization tips
-   >
-   >    > 1. Given the ratio between activation volume and computational operations, offloading all layer activations naively can become a performance bottleneck. Optimizing performance requires tuning the number of layers to offload while balancing it with recomputation.
-
-3. Weight memory-optimized BF16 training
-
-   > 1. In BF16 training, Megatron-Bridge optimizes memory usage by storing only the BF16 remainder of the master weight copies for the next optimizer update. This is possible because BF16 data can be represented using a subset of FP32 bits, allowing Megatron-Bridge to avoid redundant storage of the FP32 portion used for BF16 representation. This is default enabled when using precision-aware optimizer in Megatron Core.
-   >
-   >    > 1. `OptimizerConfig.use_precision_aware_optimizer=True`
-
-4. Common memory usage hikes from environment variable setting
-
-   > 1. The environment variables below will (1) avoid preserving the buffers for NCCL communication and (2) disable NVLink SHARP (NVLS) when it is not used. Both options lower GPU memory usage.
-   >
-   >    > 1. `TORCH_NCCL_AVOID_RECORD_STREAMS=1`
-   >    > 2. `NCCL_NVLS_ENABLE=0`
-   >
-   > 2. While not enabled by default, you can further reduce memory usage caused by segmentation penalties by setting the env var shown below.
-   >
-   >    > 1. `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`
-
-5. Keep parameters in FP8 in FP8 training
-
-   > 1. In FP8 training, after optimizer step execution, we can keep the parameters in FP8. Compared to the baseline that keeps the intermediate weight values in BF16, FP8 parameters lower memory usage and improve communication performance. The below knob enables keeping the parameters in FP8.
-   >
-   >    > 1. `MixedPrecisionConfig.fp8_param_gather=True`
-
-## Operator Fusion
-
-1. You can control specific fusion behaviors using the following configuration knobs:
-
-   > 1. `TransformerConfig.masked_softmax_fusion=true`
-   > 2. `GPTProvider.cross_entropy_loss_fusion=true`
-   > 3. `GPTProvider.gradient_accumulation_fusion=true`
-   > 4. `TransformerConfig.bias_activation_fusion=true`
-   > 5. `TransformerConfig.bias_dropout_fusion=true`
-   > 6. `TransformerConfig.apply_rope_fusion=true`
-
-2. Megatron-Bridge offers different Flash Attention options, which can be chosen through the model config:
-
-   > 1. Let Transformer Engine decide (default): `TransformerConfig.attention_backend=AttnBackend.auto`
-   > 2. FlashAttention2: `TransformerConfig.attention_backend=AttnBackend.flash`
-   > 3. cuDNN fused attention: `TransformerConfig.attention_backend=AttnBackend.fused`
-
-(long-sequence-train)=
-## Long Sequence Training
-
-1. Problem of long sequence training
-
-   > 1. Training with long sequence length can lead to memory overflow due to the huge memory cost of activations. The problem could be solved by recomputing activations in backward, but it can impose up to ~30% overheads in each training step. Context parallelism is a better solution which splits the sequence dimension across multiple GPUs, so that each GPU only computes and saves activations of a sequence chunk. In this way, memory overflow is addressed without introducing any redundant compute.
-
-2. CP to shard activation (knob)
-
-   > 1. `TransformerConfig.context_parallel_size=<int>`
-   >
-   >    > 1. Both TP and CP can reduce activation memory overheads. It's not wise to be biased to either of them. Communications of TP and CP are overlapped by GEMM and Attention respectively. Blindly enlarging their sizes can make some communications hard to overlap. It's recommended to sweep a combination of TP+CP configs. The optimal config is expected to make full use of all related compute and do best overlapping, thereby achieving best end-to-end performance.
-   >
-   > 2. `TransformerConfig.cp_comm_type=<str> or <list of str>`
-   >
-   >    > 1. Megatron-Core provides multiple implementation variants of CP and allows you to make choices based on your specific use cases by configuring "cp_comm_type". The configuration value can be `p2p`, `all_gather`, `a2a`, or `a2a+p2p`. These communication types are compatible with each other, so they can be flexibly interleaved between transformer layers. You only need to provide a list, where each element corresponds to a layer.
-   >    > 2. `p2p`: exchanges KV sequence chunks in ring-topology. The P2P communications can be fully overlapped.
-   >    > 3. `all_gather`: inserts an all-gather before attention to get a full sequence of KV. The all-gather is exposed, but it should not impose big overheads if GQA/MQA are used, as they have very few KV heads.
-   >    > 4. `a2a`: is an implementation of DeepSpeed Ulysses. A2A communications are added before and after the attention module to gather full sequence length and further scatter heads in CP domain. A2A cannot be overlapped.
-   >    > 5. `a2a+p2p`: is a middle ground between `a2a` and `p2p`. This is useful for cases of big CP sizes, where each sequence chunk is too short to overlap P2P communications. It first does A2A in partial CP groups to gather relatively longer sequence chunks, then applies P2P implementation to the gathered chunks. It also can be helpful for hierarchical CP communications, for example A2A and P2P happen in NVLink and IBLink domains respectively.
-   >    > 6. With small and medium CP size, `p2p` is the recommended configuration because communications can be fully overlapped; "all_gather" also should work fine with GQA/MQA. As for strongly-scaling a sequence length with big CP sizes, the short chunk length can barely overlap the `p2p` communications, so `a2a+p2p` ought to be the preferred choice. `a2a` could be adopted in some cases for its simplicity. However, CP size can be restricted with "a2a" because it requires the number of attention heads to be divisible by CP size. Restricted CP size will finally limit the sequence length that can be run.
-
-3. Activation recomputation (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow))
-
-4. Activation offloading to host memory (in [Techniques for Reducing Memory to Avoid Memory Overflow and Enhance Training Efficiency](#reducing-memory-overflow))
-
-## Sequence Packing for Performant Fine-Tuning
-
-1. Dataset preparation
-
-   > 1. Fine-tuning datasets with shorter sequences of variable length can be packed into longer sequences, up to a set maximum length, for best efficiency.
-
-2. To use this feature, the microbatch size must be set to 1. In place of increasing the micro batch size, the maximum sequence length can be increased, which will effectively increase the number of individual sequences per packed sequence.
-
-3. Enabled with:
-
-   > 1. `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size=<max sequence length>`
-   > 2. `TrainingConfig.micro_batch_size=1`
-
-4. Performance benefits also include:
-
-   > 1. Inconsistent lengths between sequences in the fine-tuning dataset would reduce the computation efficiency. With a micro-batch size over 1, all sequences must be padded with empty tokens to the length of the longest one in the micro-batch. Similarly, some optimizations like CUDA graphs require uniform sequence lengths between micro-batches. Packed sequences are arranged so that the total number of tokens per packed sequence is as close to the maximum length as possible, making most processed tokens useful.
-   > 2. Likewise, when using data parallelism, variance in the time needed to process different batches can result in all batches needing to wait for the longest one to finish; this variance is reduced with packed sequences.
-
-## GPU Core Clock Optimization
-
-1. Increase the clock ratio of GPU core over off-chip memory system
-
-   > 1. NVIDIA GPUs support a GPU core clock boost mode, which increases the core clock rate by reducing the off-chip memory clock rate. This is particularly beneficial for LLMs, which are typically compute throughput-bound.
-   >
-   >    > 1. `sudo nvidia-smi boost-slider --vboost 1 <run commandline>`
-
-## Profiling Options for Analysis-based Performance Tuning
-
-1. Nsight system profile
-
-   > 1. Megatron-Bridge provides an interface to enable the NVIDIA Nsight Systems profiler, which displays the GPU execution trace of all CUDA streams. You can check whether communication kernels overlap with computation kernels and adjust resource allocation to balance communication and computation. The Nsight Systems profile can be enabled using ProfilingConfig, as shown below.
-   > 2. `ProfilingConfig(use_nsys_profiler=True, profile_start_step=<int>, profile_end_step=<int>, profile_ranks=<[0,...]>)`
-
-2. Memory snapshot
-
-   > 1. Megatron-Bridge provides an interface to extract the memory snapshot that shows the memory allocation bytes, the allocation lifespan, and the function call stack. Extracting the memory snapshot can be enabled by ProfilingConfig as shown below.
-   > 2. `ProfilingConfig(record_memory_history=True, memory_snapshot_path=</path/to/store/the/output/file>, profile_ranks=<[0,...]>)`
-
-## DeepEP: Common Issues and Solutions
-
-DeepEP is a communication library optimized for Mixture-of-Experts (MoE) all-to-all operations. When using DeepEP for cross-node Expert Parallelism (EP), there are several common issues related to network transport and GPU-NIC affinity that can significantly impact performance.
-
-> Note: DeepEP is best optimized for NVL8 systems such as the DGX-B200 NVL8 or DGX-H200 NVL8. For GB200 NVL72 rack-scale systems, where 72 GPUs are interconnected within the same NVLINK domain, we recommend using [HybridEP](https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) instead of DeepEP. HybridEP is maintained by NVIDIA and is specifically optimized for NVL72 rack scale systems. It is also integrated into the Megatron-core [fused all-to-all module](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.transformer.moe.fused_a2a.html) as an alternative backend under the `flex` token dispatcher.
->
-> Learn more about GB200 MoE training best practices [here](https://github.com/NVIDIA/Megatron-LM/blob/dev/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md).
-
-### 1. Why is my DeepEP not working
-
-1. What is IBGDA and why is it a problem
-
-   DeepEP achieves optimal cross-node communication performance using InfiniBand GPU Direct Async (IBGDA), which is supported by ConnectX NICs in both InfiniBand and RoCEv2 modes. However, IBGDA is not always enabled by default—it often requires cluster administrators to actively configure the system and enable GPU Direct RDMA support in the InfiniBand (or RoCEv2) fabric. If this configuration step is skipped or unsupported in the cluster environment, IBGDA may be unavailable, which can prevent DeepEP inter-node EP capability from functioning.
-
-1. Network Transport: IBGDA vs. IBRC
-
-   > 1. IBGDA (InfiniBand GPU Direct Async) requires cluster administrators to enable GPU Direct RDMA and configure the InfiniBand subsystem. Many clusters do not have IBGDA enabled by default.
-   > 2. The official DeepEP main branch has removed support for IBRC (InfiniBand Reliable Connection), which previously served as a fallback mechanism. With IBRC, a CPU proxy thread will assist in processing the EP communication, which might have performance degradation compared to IBGDA, but we find such performance degradation doesn't overshadow the benefit of enabling wideEP in production training.
-
-2. Solution: NVSHMEM 3.5 with Automatic Transport Fallback
-
-   > 1. NVSHMEM 3.5 introduces improved auto-fallback support for cross-node communication under various network configurations. It can automatically select the best available transport (IBGDA, IBRC, or other supported mechanisms) based on cluster capabilities.
-   > 2. To benefit from NVSHMEM’s auto-fallback in DeepEP:
-   >    - Download the [official NVSHMEM 3.5.19-1 release](https://github.com/NVIDIA/nvshmem/releases/tag/v3.5.19-1). You can also choose to compile it from source in your container environment; we provide such examples later in this guide.
-   >    - Switch to the [DeepEP branch with native NVSHMEM API integration](https://github.com/seth-howell/DeepEP/tree/nvshmem_native_apis). This branch enables automatic use of NVSHMEM’s fallback mechanisms without requiring any manual code modifications.
-
-### 2. GPU-NIC Affinity and Bandwidth Contention
-
-A common cause of poor DeepEP performance is incorrect GPU-to-NIC (Network Interface Card) affinity, where multiple GPUs compete for bandwidth on a single NIC. As noted in [DeepEP PR #466](https://github.com/deepseek-ai/DeepEP/pull/466), cross-node EP performance may degrade if multiple GPUs use the same NIC, due to certain GPU-NIC affinity in some clusters. This PR provides a solution by supporting the environment variable `DEEP_EP_DEVICE_TO_HCA_MAPPING` to specify GPU-to-NIC mappings so that each GPU is automatically bound to the optimal NIC for maximum DeepEP throughput.
-
-With this PR's solution, we need the following environment variables to map GPUs to NICs correctly. First, you need to find out the names of the NICs by running `ibstat`. In our example, we found the following for one RoCEv2 DGX-B200 cluster:
-```
-> ibstat | grep ^CA
-CA 'rocep145s0'
-CA 'rocep146s0'
-CA 'rocep152s0'
-CA 'rocep153s0'
-CA 'rocep198s0'
-CA 'rocep199s0'
-CA 'rocep205s0'
-CA 'rocep206s0'
-```
-
-Use the following environment variables to map GPUs to NICs. Note that `0:rocep145s0:1` is formatted as `<CUDA_device_id>:<NIC_name>:<port>` so that each GPU will only be mapped to one dedicated NIC.
-```bash
-export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
-export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1"
-```
-
-### 3. Build DeepEP
-
-In this section, we provide a reference Dockerfile that shows how to build NVSHMEM 3.5 and the customized DeepEP into your container environment.
-
-Note that the following example is provided for DGX-B200 NVL8 systems, but similar ideas apply to Hopper generation as well—just change the Dockerfile accordingly. For example, you just need to change the compile target for SM90.
-
-Key points:
-
-- NVSHMEM source: https://github.com/NVIDIA/nvshmem/tree/v3.5.19-1
-- DeepEP branch that we cherry-picked with all the fixes above: https://github.com/zhongbozhu/DeepEP/tree/nvshmem_deepep_gcp
-- Example training container template for DGX-B200: https://github.com/yanring/Megatron-MoE-ModelZoo/blob/main/dockers/B200.Dockerfile 
-
-**Dockerfile**
-```dockerfile
-FROM nvcr.io/nvidia/pytorch:25.11-py3 as base
-
-# Other dependencies you may want
-...
-
-# Dependency of IBGDA
-RUN ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
-
-# Clone DeepEP customized version 
-WORKDIR /home/dpsk_a2a
-RUN git clone https://github.com/zhongbozhu/DeepEP.git ./deepep
-RUN cd ./deepep && git checkout nvshmem_deepep_gcp && cd /home/dpsk_a2a
-
-# Clone NVSHMEM 3.5 https://github.com/NVIDIA/nvshmem
-RUN git clone --branch v3.5.19-1 https://github.com/NVIDIA/nvshmem.git ./deepep-nvshmem
-RUN cd ./deepep-nvshmem && git checkout v3.5.19-1 && cd /home/dpsk_a2a
-
-# Build nvshmem from source
-# You can also download the pre-built binary, and skip the following 
-RUN apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y \
-        clang \
-        llvm-dev \
-        libclang-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /home/dpsk_a2a/deepep-nvshmem
-RUN mkdir -p build && mkdir -p install && \
-    cmake -S . -B build \
-    -DCMAKE_INSTALL_PREFIX=/home/dpsk_a2a/deepep-nvshmem/install \
-    -DCUDA_HOME=/usr/local/cuda \
-    -DMPI_HOME=/opt/hpcx/ompi \
-    -DMPI_C_COMPILER=/opt/hpcx/ompi/bin/mpicc \
-    -DMPI_CXX_COMPILER=/opt/hpcx/ompi/bin/mpicxx \
-    -DNVSHMEM_MPI_SUPPORT=OFF \
-    -DNVSHMEM_IBRC_SUPPORT=ON \
-    -DNVSHMEM_IBGDA_SUPPORT=ON \
-    -DNVSHMEM_IBDEVX_SUPPORT=OFF \
-    -DNVSHMEM_UCX_SUPPORT=OFF \
-    -DNVSHMEM_SHMEM_SUPPORT=OFF \
-    -DNVSHMEM_PMIX_SUPPORT=OFF \
-    -DNVSHMEM_USE_NCCL=OFF \
-    -DNVSHMEM_USE_GDRCOPY=ON \
-    -DGDRCOPY_HOME=/usr \
-    -DNVSHMEM_USE_MLX5DV=ON \
-    -DNVSHMEM_BUILD_TESTS=ON \
-    -DNVSHMEM_BUILD_EXAMPLES=ON \
-    -DNVSHMEM_BUILD_PYTHON_LIB=OFF \
-    -DNVSHMEM_BUILD_BITCODE_LIBRARY=OFF \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_CUDA_ARCHITECTURES="100" && \
-    cmake --build build -j && \
-    cmake --install build
-
-ENV NVSHMEM_DIR=/home/dpsk_a2a/deepep-nvshmem/install
-ENV LD_LIBRARY_PATH=${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH
-ENV PATH=${NVSHMEM_DIR}/bin:$PATH
-
-## Build deepep
-WORKDIR /home/dpsk_a2a/deepep
-ENV TORCH_CUDA_ARCH_LIST="10.0"
-ENV PIP_NO_BUILD_ISOLATION=1
-ENV CPATH=${CUDA_HOME}/include/cccl:$CPATH
-RUN pip install --no-build-isolation .
-
-```
-
-DeepEP provides `test_internode.py` to test and benchmark cross-node EP communication. In our experiment, when using 4 nodes of DGX-B200 (i.e., EP32), the achieved throughput for cross-EP is about 50 GB/s with IBRC. We provide an example SLURM script below for running such a test with DeepEP.
-
-In another experiment on the same cluster, with IBGDA enabled by the cluster admin, we observed approximately 10% higher inter-node performance—roughly 55 GB/s. To enable IBGDA, you need to set the environment variable `export NVSHMEM_IB_ENABLE_IBGDA=true`; there is no need to change the software version or container, because with the software provided above, both modes will work.
-
-```bash
-srun --account=<your_account> -N 4 -p batch --time 30 \
-     --ntasks-per-node=1 --gpus-per-node=8 \
-     --no-container-mount-home --container-mounts "/lustre:/lustre" \
-     --container-image <your_container_path> \
-     --mpi=none --export=ALL \
-     bash -lc '
-set -eo pipefail 
-
-# Env Var for GPU-NIC mapping
-export NVSHMEM_ENABLE_NIC_PE_MAPPING=1
-export DEEP_EP_DEVICE_TO_HCA_MAPPING="0:rocep145s0:1,1:rocep146s0:1,2:rocep152s0:1,3:rocep153s0:1,4:rocep198s0:1,5:rocep199s0:1,6:rocep205s0:1,7:rocep206s0:1"
-
-
-# 1) Expand SLURM_JOB_NODELIST and grab the first hostname
-headnode=$(python - <<PY
-import os, re
-nl = os.environ.get("SLURM_JOB_NODELIST", "") or os.environ.get("SLURM_NODELIST", "")
-if not nl:
-    print(""); raise SystemExit(0)
-m = re.match(r"^([^-\\[]+)-(\\[(.+)\\]|(\\d+))$", nl)
-if not m:
-    # no bracket/range, just print it as-is
-    print(nl); raise SystemExit(0)
-prefix = m.group(1)
-br_or_num = m.group(3) or m.group(4)
-candidates = []
-for part in br_or_num.split(","):
-    part = part.strip()
-    if "-" in part:
-        a,b = part.split("-",1)
-        # preserve zero padding
-        width = max(len(a), len(b))
-        start, end = int(a), int(b)
-        candidates.append(f"{prefix}-{start:0{width}d}")
-    else:
-        candidates.append(f"{prefix}-{part}")
-print(sorted(candidates)[0])
-PY
-)
-
-if [[ -z "$headnode" ]]; then
-  echo "Could not determine master host from SLURM_JOB_NODELIST"; exit 1
-fi
-
-# 2) Resolve to an IP that both nodes can reach (fallback to the hostname)
-if command -v getent >/dev/null 2>&1; then
-  master_ip=$(getent ahostsv4 "$headnode" | awk "{print \$1; exit}")
-else
-  master_ip=""
-fi
-MASTER_ADDR="${master_ip:-$headnode}"
-
-# 3) Export rendezvous env that matches test_internode.py expectations
-export MASTER_ADDR
-export MASTER_PORT=${MASTER_PORT:-29500}
-export WORLD_SIZE=${SLURM_NNODES:-2}   # number of nodes
-export RANK=${SLURM_NODEID:-0}         # 0..N-1 per node
-
-export OMP_NUM_THREADS=1
-python -u /home/dpsk_a2a/deepep/tests/test_internode.py
-'
-
-```
-
-
-
-
-
-
-
-
-
-
-## Index - List of Tuning Knobs
-
-- `CommOverlapConfig.tp_comm_overlap`
-- `CommOverlapConfig.tp_comm_overlap_cfg`
-- `CUDA_DEVICE_MAX_CONNECTIONS`
-- `TrainingConfig.manual_gc_interval`
-- `MixedPrecisionConfig.fp8_param`
-- `ProfilingConfig`
-- `NCCL_NET_GDR_C2C`
-- `NCCL_NET_GDR_LEVEL`
-- `NCCL_NVLS_ENABLE`
-- `NVTE_BWD_LAYERNORM_SM_MARGIN`
-- `TransformerConfig.attention_backend`
-- `AttnBackend`
-- `NVTE_FWD_LAYERNORM_SM_MARGIN`
-- `PYTORCH_CUDA_ALLOC_CONF`
-- `TrainingConfig.micro_batch_size`
-- `FinetuningDatasetConfig.packed_sequence_specs.packed_sequence_size`
-- `TransformerConfig.apply_rope_fusion`
-- `TransformerConfig.bias_activation_fusion`
-- `TransformerConfig.bias_dropout_fusion`
-- `TransformerConfig.cp_comm_type`
-- `TransformerConfig.cpu_offloading`
-- `TransformerConfig.cpu_offloading_num_layers`
-- `TransformerConfig.cpu_offloading_weights`
-- `GPTProvider.cross_entropy_loss_fusion`
-- `TransformerConfig.cuda_graph_impl` / `cuda_graph_scope` (see [CUDA Graphs](training/cuda-graphs.md))
-- `MixedPrecisionConfig.fp8_param_gather`
-- `GPTProvider.gradient_accumulation_fusion`
-- `TransformerConfig.masked_softmax_fusion`
-- `TransformerConfig.recompute_granularity`
-- `TransformerConfig.recompute_method`
-- `TransformerConfig.recompute_num_layers`
-- `OptimizerConfig.use_precision_aware_optimizer`
-- `GPTProvider.account_for_embedding_in_pipeline_split`
-- `GPTProvider.account_for_loss_in_pipeline_split`
-- `TransformerConfig.context_parallel_size`
-- `DistributedDataParallelConfig.align_param_gather`
-- `DistributedDataParallelConfig.bucket_size`
-- `DistributedDataParallelConfig.bucket_size`
-- `DistributedDataParallelConfig.data_parallel_sharding_strategy`
-- `DistributedDataParallelConfig.grad_reduce_in_fp32`
-- `DistributedDataParallelConfig.num_distributed_optimizer_instances`
-- `DistributedDataParallelConfig.overlap_grad_reduce`
-- `DistributedDataParallelConfig.overlap_param_gather`
-- `T5ModelProvider.encoder_pipeline_model_parallel_size`
-- `T5ModelProvider.encoder_tensor_model_parallel_size`
-- `TransformerConfig.expert_model_parallel_size=<int>`
-- `TransformerConfig.expert_tensor_parallel_size=<int>`
-- `TransformerConfig.moe_grouped_gemm`
-- `DistributedInitConfig.use_torch_fsdp2`
-- `TransformerConfig.pipeline_model_parallel_size`
-- `TransformerConfig.tensor_model_parallel_size`
-- `TransformerConfig.virtual_pipeline_model_parallel_size`
-- `OptimizerConfig.use_distributed_optimizer`
-- `TORCH_NCCL_AVOID_RECORD_STREAMS`
-- `TPOverlapCfg.cga_size`
-- `TPOverlapCfg.fp8_buf`
-- `TPOverlapCfg.num_sm`
-- `TPOverlapCfg.num_split`
-<!-- - `garbageCollectionCallback.gc_interval_val` -->
-<!-- - `NsysPlugin` -->
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/cpu-offloading.md
-```md
-# CPU Offloading
-
-## Overview
-
-CPU Offloading in Megatron Bridge is a feature that reduces the peak memory usage of the GPU by offloading activations and inactive weights to CPU memory. Megatron Bridge supports offloading at the transformer layer level, allowing users to specify the number of transformer layers in their language model that require CPU offloading. During the forward pass, Megatron Bridge offloads activations at the optimal time and reloads them as needed during the backward pass.
-
-## Features
-
-- Supports training models with long sequence lengths by managing activation memory efficiently
-- Enables high batch sizes per GPU by offloading activation memory
-- Overlaps computation with data transfers (Host2Device and Device2Host) during offloading and reloading
-
-## Configuration
-
-CPU offloading is configured through the model provider parameters:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Basic CPU offloading configuration
-model_config = GPTModelProvider(
-    # Model architecture
-    hidden_size=4096,
-    num_layers=32,
-    
-    # CPU offloading settings
-    cpu_offloading=True,              # Enable CPU offloading
-    cpu_offloading_num_layers=16,     # Number of layers to offload (0 to num_layers-1)
-    cpu_offloading_activations=True,  # Offload activations
-    cpu_offloading_weights=True,      # Offload weights
-    
-    # ... other model parameters
-)
-```
-
-### Configuration Parameters
-
-- **`cpu_offloading`**: Set to `True` to enable CPU offloading
-- **`cpu_offloading_num_layers`**: Number of transformer layers to offload (value between 0 and total number of layers minus one)
-- **`cpu_offloading_activations`**: Whether to offload activations to CPU memory (default: `True`)
-- **`cpu_offloading_weights`**: Whether to offload inactive weights to CPU memory (default: `False`)
-- **`cpu_offloading_double_buffering`**: Enable double buffering across layers while reloading activations from CPU (default: `False`)
-
-### Offloading Strategies
-
-You can configure different combinations of offloading based on your memory requirements:
-
-#### Activations Only
-```python
-model_config = GPTModelProvider(
-    cpu_offloading=True,
-    cpu_offloading_num_layers=8,
-    cpu_offloading_activations=True,   # Offload activations
-    cpu_offloading_weights=False,      # Keep weights on GPU
-)
-```
-
-#### Weights Only
-```python
-model_config = GPTModelProvider(
-    cpu_offloading=True,
-    cpu_offloading_num_layers=8,
-    cpu_offloading_activations=False,  # Keep activations on GPU
-    cpu_offloading_weights=True,       # Offload weights
-)
-```
-
-#### Both Activations and Weights
-```python
-model_config = GPTModelProvider(
-    cpu_offloading=True,
-    cpu_offloading_num_layers=8,
-    cpu_offloading_activations=True,   # Offload activations
-    cpu_offloading_weights=True,       # Offload weights
-)
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/README.md
-```md
-# Supported Models
-
-This directory contains documentation for all models supported by Megatron Bridge, including Large Language Models (LLMs) and Vision Language Models (VLMs). Each model documentation includes architecture details, conversion examples for Hugging Face ↔ Megatron Bridge, and links to training recipes.
-
-## Model Categories
-
-Megatron Bridge supports two main categories of models:
-
-### 🔤 Large Language Models (LLMs)
-
-Text-only models for language understanding and generation tasks.
-
-| Category | Model Count | Documentation |
-|----------|-------------|---------------|
-| **Large Language Models** | 13 models | [LLM Documentation](llm/README.md) |
-
-**Supported LLM Families:**
-
-- DeepSeek (V2, V3)
-- Gemma (2, 3)
-- GLM-4.5
-- GPT-OSS
-- LLaMA (3, Nemotron)
-- Mistral
-- Moonlight
-- Nemotron-H
-- OLMoE
-- Qwen (2, 2.5, 3, 3 MoE, 3-Next)
-
-### 🖼️ Vision Language Models (VLMs)
-
-Multimodal models that combine vision and language capabilities.
-
-| Category | Model Count | Documentation |
-|----------|-------------|---------------|
-| **Vision Language Models** | 4 models | [VLM Documentation](vlm/README.md) |
-
-**Supported VLM Families:**
-
-- Gemma 3 VL
-- Nemotron Nano V2 VL
-- Qwen (2.5 VL, 3 VL)
-
----
-
-## Quick Navigation
-
-### I want to
-
-**🔍 Find a specific LLM model**
-→ Browse [Large Language Models](llm/README.md) documentation
-
-**🖼️ Find a specific VLM model**
-→ Browse [Vision Language Models](vlm/README.md) documentation
-
-**🔄 Convert models between formats**
-→ See [Bridge Guide](../bridge-guide.md) for Hugging Face ↔ Megatron conversion
-
-**🚀 Get started with training**
-→ See [Training Documentation](../training/README.md) for training guides
-
-**📚 Understand model architectures**
-→ Each model page documents architecture-specific features and configurations
-
-**🔧 Add support for a new model**
-→ Refer to [Adding New Models](../adding-new-models.md)
-
-**📊 Use training recipes**
-→ Read [Recipe Usage](../recipe-usage.md) for pre-configured training recipes
-
----
-
-## Model Documentation Structure
-
-Each model documentation page typically includes:
-
-1. **Model Overview** - Architecture and key features
-2. **Available Variants** - Supported model sizes and configurations
-3. **Conversion Examples** - Converting between Hugging Face and Megatron formats
-4. **Training Recipes** - Links to training configurations and examples
-5. **Architecture Details** - Model-specific features and configurations
-
----
-
-## Common Tasks by Model Type
-
-### For LLM Models
-
-**Training:**
-
-- Pretraining on large corpora
-- Supervised fine-tuning (SFT)
-- Parameter-efficient fine-tuning (PEFT/LoRA)
-- Preference optimization (DPO)
-
-**Deployment:**
-
-- Export to Hugging Face format
-- Integration with inference engines
-- Model serving and deployment
-
-**Use Cases:**
-
-- Text generation
-- Question answering
-- Conversational AI
-- Code generation
-
-### For VLM Models
-
-**Training:**
-
-- Multimodal pretraining
-- Vision-language alignment
-- Fine-tuning on visual tasks
-
-**Deployment:**
-
-- Export to Hugging Face format
-- Multimodal inference
-
-**Use Cases:**
-
-- Image captioning
-- Visual question answering
-- Document understanding
-- Multimodal reasoning
-
----
-
-## Related Documentation
-
-### Getting Started
-
-- **[Main Documentation](../README.md)** - Return to main documentation
-- **[Bridge Guide](../bridge-guide.md)** - Hugging Face ↔ Megatron conversion
-- **[Bridge Tech Details](../bridge-tech-details.md)** - Technical details of the bridge system
-
-### Training Resources
-
-- **[Training Documentation](../training/README.md)** - Comprehensive training guides
-- **[Configuration Container](../training/config-container-overview.md)** - Training configuration
-- **[Parallelisms Guide](../parallelisms.md)** - Data and model parallelism strategies
-- **[Performance Guide](../performance-guide.md)** - Performance optimization
-
-### Advanced Topics
-
-- **[Adding New Models](../adding-new-models.md)** - Extending model support
-- **[Recipe Usage](../recipe-usage.md)** - Using pre-configured training recipes
-- **[Bridge RL Integration](../bridge-rl-integration.md)** - Reinforcement learning integration
-- **[PEFT](../training/peft.md)** - Parameter-efficient fine-tuning
-
----
-
-## Model Support Overview
-
-### By Architecture Type
-
-**Decoder-Only (Autoregressive):**
-
-- GPT-style models (GPT-OSS)
-- LLaMA family (LLaMA 3, LLaMA Nemotron)
-- Qwen family (Qwen 2, 2.5, 3, 3-Next)
-- Gemma family (Gemma 2, 3)
-- DeepSeek family (DeepSeek V2, V3)
-- Mistral, Moonlight, Nemotron-H, GLM-4.5
-
-**Mixture-of-Experts (MoE):**
-
-- Qwen 3 MoE, Qwen 3-Next
-- DeepSeek V2, V3
-- OLMoE
-
-**Vision-Language (Multimodal):**
-
-- Gemma 3 VL
-- Qwen 2.5 VL, Qwen 3 VL
-- Nemotron Nano V2 VL
-
-### By Provider
-
-**Meta/LLaMA:**
-
-- LLaMA 3
-
-**NVIDIA:**
-
-- LLaMA Nemotron
-- Nemotron-H
-- Nemotron Nano V2 VL
-
-**Alibaba Cloud:**
-
-- Qwen (2, 2.5, 3, 3 MoE, 3-Next)
-- Qwen VL (2.5, 3)
-
-**Google:**
-
-- Gemma (2, 3)
-- Gemma 3 VL
-
-**DeepSeek:**
-
-- DeepSeek (V2, V3)
-
-**Other:**
-
-- Mistral AI (Mistral)
-- GLM-4.5
-- GPT-OSS
-- Moonlight
-- OLMoE
-
----
-
-## Conversion Support
-
-All models support bidirectional conversion:
-
-- **Hugging Face → Megatron Bridge**: Load pretrained weights for training
-- **Megatron Bridge → Hugging Face**: Export trained models for deployment
-
-Conversion features:
-
-- Automatic architecture detection
-- Parallelism-aware conversion (TP/PP/VPP/CP/EP)
-- Streaming and memory-efficient transfers
-- Verification mechanisms for conversion accuracy
-
-Refer to the [Bridge Guide](../bridge-guide.md) for detailed conversion instructions.
-
----
-
-**Ready to explore?** Choose a model category:
-
-- [Large Language Models (LLMs)](llm/README.md)
-- [Vision Language Models (VLMs)](vlm/README.md)
-
-Or return to the [main documentation](../README.md).
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/super/slurm_pretrain.sh
-```sh
-#!/bin/bash
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ==============================================================================
-# Nemotron 3 Super Pretraining
-#
-# Nemotron 3 Super is a 120B parameter model with A12B (Active 12 Billion) architecture
-# Supports multiple parallelism configs: each "TP,PP,EP,CP,SP" runs sequentially.
-#
-# Note: The default recipe uses NVFP4 mixed precision, which requires Blackwell GPUs.
-#       For Hopper GPUs, add: mixed_precision="bf16_mixed" to CLI_OVERRIDES.
-#
-# Usage:
-#   1. Modify the #SBATCH directives below for your cluster
-#   2. Set CONTAINER_IMAGE to your container path
-#   3. Set PARALLELISM_CONFIGS (TP,PP,EP,CP,SP per entry; CP = context parallel size, 1 = disabled)
-#   4. Submit: sbatch slurm_pretrain.sh
-# ==============================================================================
-
-#SBATCH --job-name=nemotron3-super-pretrain
-#SBATCH --nodes=8
-#SBATCH --ntasks-per-node=8
-#SBATCH --gpus-per-node=8
-#SBATCH --time=24:00:00
-#SBATCH --partition=gpu
-#SBATCH --account=my_account
-#SBATCH --output=logs/nemotron3_super_pretrain_%j.out
-#SBATCH --error=logs/nemotron3_super_pretrain_%j.err
-#SBATCH --exclusive
-
-# ==============================================================================
-# CONFIGURATION
-# ==============================================================================
-
-# Workspace directory for checkpoints and results
-WORKSPACE=${WORKSPACE:-/workspace}
-
-# Model and training configurations
-MODEL_NAME=nemotron_3_super
-DATASET_NAME=mock
-SEQ_LENGTH=4096
-TRAIN_ITERS=50
-GLOBAL_BATCH_SIZE=128
-MICRO_BATCH_SIZE=1
-EVAL_ITERS=10
-LR_WARMUP_ITERS=10
-LOG_INTERVAL=1
-WANDB_PROJECT=megatron-bridge-${DATASET_NAME}
-
-# Parallelism configs: "TP,PP,EP,CP,SP" per entry
-PARALLELISM_CONFIGS=("8,1,64,1,True" "4,1,64,2,True")
-
-# Container image (required)
-CONTAINER_IMAGE=""
-# CONTAINER_IMAGE="/path/to/container.sqsh"
-
-# Container mounts (optional, space-separated)
-CONTAINER_MOUNTS=""
-# CONTAINER_MOUNTS="/data:/data /workspace:/workspace"
-
-# ==============================================================================
-# Environment Setup
-# ==============================================================================
-
-# NCCL optimizations for large-scale training
-export TORCH_NCCL_AVOID_RECORD_STREAMS=1
-export NCCL_NVLS_ENABLE=0
-
-# UV cache on shared filesystem (recommended for multi-node setups)
-# Pre-sync once before submitting jobs: UV_CACHE_DIR=/path/to/cache uv sync
-# export UV_CACHE_DIR="/path/to/shared/uv_cache"
-
-# HuggingFace cache directory (recommended for shared filesystem)
-# export HF_HOME="/path/to/shared/HF_HOME"
-
-# Authentication tokens (set these for your environment)
-# export HF_TOKEN=
-# export WANDB_API_KEY=
-
-# ==============================================================================
-# Job Execution
-# ==============================================================================
-
-echo "======================================"
-echo "Nemotron 3 Super Pretraining Job"
-echo "======================================"
-echo "Job ID: $SLURM_JOB_ID"
-echo "Nodes: $SLURM_JOB_NUM_NODES"
-echo "GPUs per node: $SLURM_GPUS_PER_NODE"
-echo "Model: $MODEL_NAME"
-echo "Parallelism configs: ${PARALLELISM_CONFIGS[*]}"
-echo "======================================"
-
-# Create logs directory if it doesn't exist
-mkdir -p logs
-
-# Require container image
-if [ -z "$CONTAINER_IMAGE" ]; then
-    echo "ERROR: CONTAINER_IMAGE must be set. Please specify a valid container image."
-    exit 1
-fi
-
-# Build srun command (shared across configs)
-SRUN_CMD="srun --mpi=pmix --container-image=$CONTAINER_IMAGE"
-if [ -n "$CONTAINER_MOUNTS" ]; then
-    SRUN_CMD="$SRUN_CMD --container-mounts=$CONTAINER_MOUNTS"
-fi
-echo "SRUN base: $SRUN_CMD"
-echo "======================================"
-
-# Run each parallelism config in sequence
-CONFIG_INDEX=0
-for CONFIG in "${PARALLELISM_CONFIGS[@]}"; do
-    IFS=',' read -r TP PP EP CP SP <<< "$CONFIG"
-    CONFIG_INDEX=$((CONFIG_INDEX + 1))
-    echo ""
-    echo "======================================"
-    echo "Config $CONFIG_INDEX/${#PARALLELISM_CONFIGS[@]}: TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP"
-    echo "======================================"
-
-    # Build CLI overrides for this config
-    CLI_OVERRIDES="\
-        model.seq_length=$SEQ_LENGTH \
-        train.train_iters=$TRAIN_ITERS \
-        train.global_batch_size=$GLOBAL_BATCH_SIZE \
-        train.micro_batch_size=$MICRO_BATCH_SIZE \
-        train.eval_iters=$EVAL_ITERS \
-        scheduler.lr_warmup_iters=$LR_WARMUP_ITERS \
-        checkpoint.save=${WORKSPACE}/results/${MODEL_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        logger.log_interval=$LOG_INTERVAL \
-        logger.wandb_project=$WANDB_PROJECT \
-        logger.wandb_exp_name=${MODEL_NAME}_${DATASET_NAME}_pretrain_tp${TP}_pp${PP}_ep${EP}_sp${SP}_cp${CP} \
-        dataset.sequence_length=$SEQ_LENGTH \
-        model.tensor_model_parallel_size=$TP \
-        model.pipeline_model_parallel_size=$PP \
-        model.expert_model_parallel_size=$EP \
-        model.sequence_parallel=$SP \
-        model.context_parallel_size=$CP"
-
-    CMD="uv run --no-sync python scripts/training/run_recipe.py"
-    CMD="$CMD --recipe ${MODEL_NAME}_pretrain_config"
-    CMD="$CMD $CLI_OVERRIDES"
-
-    echo "Executing command..."
-    echo $CMD
-    echo "======================================"
-
-    $SRUN_CMD bash -c "$CMD"
-    RUN_EXIT=$?
-    if [ $RUN_EXIT -ne 0 ]; then
-        echo "ERROR: Config TP=$TP, PP=$PP, EP=$EP, SP=$SP, CP=$CP failed with exit code $RUN_EXIT"
-        exit $RUN_EXIT
-    fi
-done
-
-echo "======================================"
-echo "Job completed (all ${#PARALLELISM_CONFIGS[@]} configs)"
-echo "======================================"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/README.md
-```md
-# Large Language Models
-
-This directory contains documentation for Large Language Models (LLMs) supported by Megatron Bridge. Each model documentation includes examples for converting to/from 🤗 Hugging Face and links to training recipes.
-
-## Available Models
-
-Megatron Bridge supports the following LLM families:
-
-| Model | Documentation | Description |
-|-------|---------------|-------------|
-| **DeepSeek V2** | [deepseek-v2.md](deepseek-v2.md) | DeepSeek V2 model family |
-| **DeepSeek V3** | [deepseek-v3.md](deepseek-v3.md) | DeepSeek V3 model family |
-| **Gemma 2** | [gemma2.md](gemma2.md) | Google Gemma 2 models |
-| **Gemma 3** | [gemma3.md](gemma3.md) | Google Gemma 3 models |
-| **GLM-4.5** | [glm45.md](glm45.md) | GLM-4.5 model family |
-| **GPT-OSS** | [gpt-oss.md](gpt-oss.md) | Open-source GPT-style models |
-| **LLaMA 3** | [llama3.md](llama3.md) | Meta LLaMA 3 models |
-| **LLaMA Nemotron** | [llama-nemotron.md](llama-nemotron.md) | NVIDIA LLaMA Nemotron models |
-| **Mistral** | [mistral.md](mistral.md) | Mistral AI models |
-| **Moonlight** | [moonlight.md](moonlight.md) | Moonlight model family |
-| **Nemotron-3** | [nemotron3.md](nemotron3.md) | NVIDIA Nemotron-3 models |
-| **Nemotron-3 Super** | [nemotron3-super.md](nemotron3-super.md) | NVIDIA Nemotron-3 Super models |
-| **Nemotron-H** | [nemotronh.md](nemotronh.md) | NVIDIA Nemotron-H models |
-| **OLMoE** | [olmoe.md](olmoe.md) | OLMoE (Open Language Model - Mixture of Experts) |
-| **Qwen** | [qwen.md](qwen.md) | Alibaba Cloud Qwen model family |
-
-## Quick Navigation
-
-### I want to
-
-**🔍 Find a specific model**
-→ Browse the model list above or use the [index page](index.md)
-
-**🔄 Convert models between formats**
-→ Each model page includes conversion examples for Hugging Face ↔ Megatron Bridge
-
-**🚀 Get started with training**
-→ See [Training Documentation](../../training/README.md) for training guides
-
-**📚 Understand model architecture**
-→ Each model page documents architecture-specific features and configurations
-
-**🔧 Add support for a new model**
-→ Refer to [Adding New Models](../../adding-new-models.md)
-
-## Related Documentation
-
-- **[Models Overview](../README.md)** - Return to main models documentation
-- **[Vision Language Models](../vlm/README.md)** - VLM model documentation
-- **[Training Documentation](../../training/README.md)** - Training and customization guides
-- **[Bridge Guide](../../bridge-guide.md)** - Working with Hugging Face models
-- **[Adding New Models](../../adding-new-models.md)** - Extending model support
-
-## Model Documentation Structure
-
-Each model documentation page typically includes:
-
-1. **Model Overview** - Architecture and key features
-2. **Available Variants** - Supported model sizes and configurations
-3. **Conversion Examples** - Converting between Hugging Face and Megatron formats
-4. **Training Recipes** - Links to training configurations and examples
-5. **Architecture Details** - Model-specific features and configurations
-
----
-
-**Ready to explore?** Choose a model from the list above or return to the [main documentation](../../README.md).
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotronh.md
-```md
-# Nemotron H and Nemotron Nano v2
-
-[Nemotron H](https://huggingface.co/collections/nvidia/nemotron-h) and [Nemotron Nano v2](https://huggingface.co/collections/nvidia/nvidia-nemotron-v2) are families of **hybrid SSM-Attention models** from **NVIDIA** that combine Mamba (State Space Model) layers with traditional attention layers. These models achieve strong performance while maintaining computational efficiency through their hybrid architecture.
-
-The Nemotron H family includes models from 4B to 56B parameters with 8K context length, while Nemotron Nano v2 models (9B and 12B) are optimized for edge deployment with extended 128K context support.
-
-## Model Families
-
-### Nemotron H
-- **4B**: 52 layers, 3072 hidden size, 8K context
-- **8B**: 52 layers, 4096 hidden size, 8K context  
-- **47B**: 98 layers, 8192 hidden size, 8K context
-- **56B**: 118 layers, 8192 hidden size, 8K context
-
-### Nemotron Nano v2
-- **9B**: 56 layers, 4480 hidden size, 128K context
-- **12B**: 62 layers, 5120 hidden size, 128K context
-
-All models are supported via the Bridge system with specialized configurations for hybrid SSM-Attention architecture.
-
-## Model Architecture
-
-### Common Features Across All Models
-- **Architecture**: Hybrid SSM-Attention (Mamba + Multi-Query Attention)
-- **SSM**: Mamba-2 selective state space layers
-- **Attention**: Multi-query attention with QK LayerNorm and RoPE
-- **Activation**: Squared ReLU (SwiGLU in FFN)
-- **Normalization**: RMSNorm
-- **Position Embedding**: RoPE (Rotary Position Embeddings)
-- **Hybrid Pattern**: Configurable layer-wise mixing of Mamba ("M") and Attention ("*") layers
-
-### Nemotron H 4B Specifications
-- **Parameters**: 4B
-- **Layers**: 52 (Hybrid pattern: `M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-`)
-- **Hidden size**: 3072
-- **FFN hidden size**: 12288
-- **Attention heads**: 32 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 112
-- **Mamba head dim**: 64
-- **Mamba state dim**: 128
-- **Context Length**: 8K tokens
-
-### Nemotron H 8B Specifications
-- **Parameters**: 8B
-- **Layers**: 52 (Hybrid pattern: `M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-`)
-- **Hidden size**: 4096
-- **FFN hidden size**: 21504
-- **Attention heads**: 32 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 128
-- **Mamba head dim**: 64
-- **Mamba state dim**: 128
-- **Context Length**: 8K tokens
-
-### Nemotron H 47B Specifications
-- **Parameters**: 47B
-- **Layers**: 98
-- **Hidden size**: 8192
-- **FFN hidden size**: 30720
-- **Attention heads**: 64 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 256
-- **Mamba head dim**: 64
-- **Mamba state dim**: 256
-- **Context Length**: 8K tokens
-
-### Nemotron H 56B Specifications
-- **Parameters**: 56B
-- **Layers**: 118
-- **Hidden size**: 8192
-- **FFN hidden size**: 32768
-- **Attention heads**: 64 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 256
-- **Mamba head dim**: 64
-- **Mamba state dim**: 256
-- **Context Length**: 8K tokens
-
-### Nemotron Nano 9B v2 Specifications
-- **Parameters**: 9B
-- **Layers**: 56 (Hybrid pattern: `M-M-M-MM-M-M-M*-M-M-M*-M-M-M-M*-M-M-M-M*-M-MM-M-M-M-M-M-`)
-- **Hidden size**: 4480
-- **FFN hidden size**: 15680
-- **Attention heads**: 40 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 128
-- **Mamba head dim**: 80
-- **Mamba state dim**: 128
-- **Context Length**: 128K tokens
-- **Vocab size**: 131,072
-
-### Nemotron Nano 12B v2 Specifications
-- **Parameters**: 12B
-- **Layers**: 62 (Hybrid pattern: `M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M*-M-M-M-M-`)
-- **Hidden size**: 5120
-- **FFN hidden size**: 20480
-- **Attention heads**: 40 query heads, 8 key-value groups
-- **KV channels**: 128
-- **Mamba heads**: 128
-- **Mamba head dim**: 80
-- **Mamba state dim**: 128
-- **Context Length**: 128K tokens
-- **Vocab size**: 131,072
-
-## Key Features
-
-### Hybrid SSM-Attention Architecture
-- **Mamba Layers (M)**: State space model layers for efficient long-range modeling
-- **Attention Layers (*)**: Standard multi-query attention for complex reasoning
-- **Configurable Pattern**: Each model has a predefined hybrid pattern balancing efficiency and performance
-
-### Advanced Optimizations
-- **Squared ReLU Activation**: Enhanced non-linearity for better gradient flow
-- **QK LayerNorm**: Applies LayerNorm to query and key projections for training stability
-- **RoPE**: Rotary Position Embeddings with base 10000
-- **Multi-Query Attention**: Efficient attention with shared key-value heads
-- **Selective State Space**: Mamba-2 architecture with selective gating
-
-### Extended Context (Nano v2)
-- **128K Context Window**: Nemotron Nano v2 models support up to 128K tokens
-- **Efficient Long-Range Modeling**: Hybrid architecture optimized for long sequences
-
-## Conversion with 🤗 Hugging Face
-
-### Load HF → Megatron
-
-#### Nemotron H Models
-```python
-from megatron.bridge import AutoBridge
-
-# Example: Nemotron H 8B
-bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True)
-provider = bridge.to_megatron_provider()
-
-# Configure parallelism before instantiating the model
-provider.tensor_model_parallel_size = 2
-provider.pipeline_model_parallel_size = 1
-provider.context_parallel_size = 1
-provider.sequence_parallel = True
-
-provider.finalize()
-model = provider.provide_distributed_model(wrap_with_ddp=False)
-
-# Other models:
-# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-4B-Base-8K", trust_remote_code=True)
-# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-47B-Base-8K", trust_remote_code=True)
-# bridge = AutoBridge.from_hf_pretrained("nvidia/Nemotron-H-56B-Base-8K", trust_remote_code=True)
-```
-
-#### Nemotron Nano v2 Models
-```python
-from megatron.bridge import AutoBridge
-
-# Example: Nemotron Nano 9B v2
-bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base", trust_remote_code=True)
-provider = bridge.to_megatron_provider()
-
-# Configure parallelism
-provider.tensor_model_parallel_size = 2
-provider.pipeline_model_parallel_size = 1
-provider.context_parallel_size = 1
-provider.sequence_parallel = True
-
-provider.finalize()
-model = provider.provide_distributed_model(wrap_with_ddp=False)
-
-# For instruct variant:
-# bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-9B-v2", trust_remote_code=True)
-
-# For 12B model:
-# bridge = AutoBridge.from_hf_pretrained("nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base", trust_remote_code=True)
-```
-
-### Export Megatron → HF
-```python
-# Convert from a Megatron checkpoint directory to HF format
-bridge.export_ckpt(
-    megatron_path="/results/nemotronh_8b/checkpoints/iter_0500000",
-    hf_path="./nemotronh-8b-hf-export",
-)
-```
-
-## Examples
-
-- Checkpoint conversion: [examples/conversion/convert_checkpoints.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py)
-- Training scripts: [examples/models/train_any_basic.py](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/train_any_basic.py)
-
-## Finetuning Recipes
-
-### Nemotron H 4B Finetuning
-
-#### LoRA Finetuning
-```python
-from megatron.bridge.recipes.nemotronh import nemotronh_4b_peft_config
-
-cfg = nemotronh_4b_peft_config(
-    tokenizer_path="nvidia/Nemotron-H-4B-Base-8K",
-    name="nemotronh_4b_lora",
-    pretrained_checkpoint="path/to/nemotronh/4b/checkpoint",
-    peft_scheme="lora",  # or "dora" for DoRA
-    train_iters=1000,
-    global_batch_size=128,
-    finetune_lr=1e-4,
-)
-```
-
-#### Full Supervised Finetuning (SFT)
-```python
-from megatron.bridge.recipes.nemotronh import nemotronh_4b_sft_config
-
-cfg = nemotronh_4b_sft_config(
-    tokenizer_path="nvidia/Nemotron-H-4B-Base-8K",
-    name="nemotronh_4b_sft",
-    pretrained_checkpoint="path/to/nemotronh/4b/checkpoint",
-    train_iters=1000,
-    global_batch_size=128,
-    finetune_lr=5e-6,  # Lower LR for full SFT
-)
-```
-
-### Nemotron H 8B Finetuning
-
-```python
-from megatron.bridge.recipes.nemotronh import nemotronh_8b_peft_config
-
-# LoRA finetuning
-cfg = nemotronh_8b_peft_config(
-    tokenizer_path="nvidia/Nemotron-H-8B-Base-8K",
-    name="nemotronh_8b_lora",
-    pretrained_checkpoint="path/to/nemotronh/8b/checkpoint",
-    peft_scheme="lora",
-    train_iters=1000,
-    global_batch_size=128,
-    finetune_lr=1e-4,
-)
-```
-
-### Nemotron H 47B Finetuning
-
-```python
-from megatron.bridge.recipes.nemotronh import nemotronh_47b_peft_config
-
-# LoRA finetuning (recommended for 47B)
-cfg = nemotronh_47b_peft_config(
-    tokenizer_path="nvidia/Nemotron-H-47B-Base-8K",
-    name="nemotronh_47b_lora",
-    pretrained_checkpoint="path/to/nemotronh/47b/checkpoint",
-    peft_scheme="lora",
-    train_iters=1000,
-    global_batch_size=128,
-    finetune_lr=1e-4,
-) 
-```
-
-### Nemotron H 56B Finetuning
-
-```python
-from megatron.bridge.recipes.nemotronh import nemotronh_56b_peft_config
-
-# LoRA finetuning (recommended for 56B)
-cfg = nemotronh_56b_peft_config(
-    tokenizer_path="nvidia/Nemotron-H-56B-Base-8K",
-    name="nemotronh_56b_lora",
-    pretrained_checkpoint="path/to/nemotronh/56b/checkpoint",
-    peft_scheme="lora",
-    train_iters=1000,
-    global_batch_size=128,
-    finetune_lr=1e-4,
-)
-```
-
-### Nemotron Nano 9B v2 Finetuning
-
-```python
-from megatron.bridge.recipes.nemotronh import nemotron_nano_9b_v2_peft_config
-
-# LoRA finetuning
-cfg = nemotron_nano_9b_v2_peft_config(
-    tokenizer_path="nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base",
-    name="nano_9b_v2_lora",
-    pretrained_checkpoint="path/to/nano/9b/v2/checkpoint",
-    peft_scheme="lora",
-    train_iters=1000,
-    global_batch_size=128,
-    seq_length=2048,  # Can use up to 128K
-    finetune_lr=1e-4,
-)
-```
-
-### Nemotron Nano 12B v2 Finetuning
-
-```python
-from megatron.bridge.recipes.nemotronh import nemotron_nano_12b_v2_peft_config
-
-# LoRA finetuning
-cfg = nemotron_nano_12b_v2_peft_config(
-    tokenizer_path="nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base",
-    name="nano_12b_v2_lora",
-    pretrained_checkpoint="path/to/nano/12b/v2/checkpoint",
-    peft_scheme="lora",
-    train_iters=1000,
-    global_batch_size=128,
-    seq_length=2048,  # Can use up to 128K
-    finetune_lr=1e-4,
-)
-```
-
-## Default Configurations
-
-### Nemotron H Models
-
-#### 4B - LoRA (1 node, 8 GPUs)
-- TP=1, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: False
-- Precision: BF16 mixed
-- Optimized for single-GPU finetuning
-
-#### 4B - Full SFT (1 node, 8 GPUs)
-- TP=1, PP=1, CP=1, LR=5e-6
-- Sequence Parallel: False
-- Precision: BF16 mixed
-
-#### 8B - LoRA (1 node, 8 GPUs)
-- TP=1, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: False
-- Precision: BF16 mixed
-
-#### 8B - Full SFT (1 node, 8 GPUs)
-- TP=2, PP=1, CP=1, LR=5e-6
-- Sequence Parallel: True
-- Precision: BF16 mixed
-
-#### 47B - LoRA (2+ nodes)
-- TP=4, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: False
-- Precision: FP8 hybrid (recommended)
-
-#### 47B - Full SFT (4+ nodes)
-- TP=8, PP=1, CP=1, LR=5e-6
-- Sequence Parallel: True
-- Precision: FP8 hybrid
-
-#### 56B - LoRA (2+ nodes)
-- TP=4, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: False
-- Precision: FP8 hybrid (recommended)
-
-#### 56B - Full SFT (4+ nodes)
-- TP=8, PP=1, CP=1, LR=5e-6
-- Sequence Parallel: True
-- Precision: FP8 hybrid
-
-### Nemotron Nano v2 Models
-
-#### 9B - LoRA (1 node, 8 GPUs)
-- TP=2, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: True
-- Precision: BF16 mixed
-- Context: Up to 128K tokens
-
-#### 9B - Full SFT (1 node, 8 GPUs)
-- TP=2, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: True
-- Precision: BF16 mixed
-
-#### 12B - LoRA (2 nodes, 16 GPUs)
-- TP=4, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: True
-- Precision: FP8 hybrid (recommended)
-- Context: Up to 128K tokens
-
-#### 12B - Full SFT (2 nodes, 16 GPUs)
-- TP=4, PP=1, CP=1, LR=1e-4
-- Sequence Parallel: True
-- Precision: FP8 hybrid
-
-## API Reference
-
-### Nemotron H
-- Nemotron H recipes: [bridge.recipes.nemotronh](../../apidocs/bridge/bridge.recipes.nemotronh.md)
-- Nemotron H model providers: [bridge.models.nemotronh](../../apidocs/bridge/bridge.models.nemotronh.md)
-
-### Nemotron Nano v2
-- Nemotron Nano v2 recipes: [bridge.recipes.nemotronh.nemotron_nano_v2](../../apidocs/bridge/bridge.recipes.nemotronh.md)
-- Nemotron Nano v2 model providers: [bridge.models.nemotronh.NemotronNanoModelProvider9Bv2](../../apidocs/bridge/bridge.models.nemotronh.md)
-
-## Performance Optimizations
-
-### Memory Efficiency
-- **Selective Recomputation**: Reduces activation memory for larger models
-- **Sequence Parallelism**: Distributes sequence dimension across GPUs (enabled for 8B+)
-- **Context Parallelism**: Support for ultra-long sequences (Nano v2)
-- **Manual GC**: Aggressive garbage collection for stable memory usage
-- **Precision-aware optimizer**: BF16/FP8 gradients with FP32 master weights
-
-### Compute Efficiency
-- **Mamba-2 Optimizations**: Efficient selective state space computations
-- **Hybrid Architecture**: Balanced mix of Mamba and Attention layers
-- **Squared ReLU**: Efficient activation function with good gradient properties
-- **RoPE Fusion**: Optional optimization for position embeddings
-- **Multi-Query Attention**: Reduced KV cache memory and compute
-
-### Hybrid Pattern Optimization
-The hybrid override pattern determines which layers use Mamba (M) vs Attention (*):
-- **Mamba layers**: Fast, memory-efficient, good for long-range dependencies
-- **Attention layers**: Better for complex reasoning and multi-token relationships
-- **Optimal patterns**: Pre-configured per model size based on extensive experimentation
-
-## Pipeline Parallelism Layouts
-
-Nemotron H models support several PP configurations with pre-defined layouts:
-- **PP=1**: No pipelining (default for most configurations)
-- **PP=2**: Supported with symmetric layer splits
-- **PP=4**: Supported for larger models (47B, 56B)
-- **VP (Virtual Pipeline)**: Supported for reducing pipeline bubbles
-
-## Hugging Face Model Cards
-
-### Nemotron H Models
-- **4B Base**: [nvidia/Nemotron-H-4B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-4B-Base-8K)
-- **8B Base**: [nvidia/Nemotron-H-8B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-8B-Base-8K)
-- **47B Base**: [nvidia/Nemotron-H-47B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-47B-Base-8K)
-- **56B Base**: [nvidia/Nemotron-H-56B-Base-8K](https://huggingface.co/nvidia/Nemotron-H-56B-Base-8K)
-
-### Nemotron Nano v2 Models
-- **9B Base**: [nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2-Base)
-- **9B Instruct**: [nvidia/NVIDIA-Nemotron-Nano-9B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-9B-v2)
-- **12B Base**: [nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-Base)
-- **12B Instruct**: [nvidia/NVIDIA-Nemotron-Nano-12B-v2](https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2)
-
-## Technical Resources
-
-### Research Papers
-- **Nemotron Technical Report**: [arXiv:2508.14444](https://arxiv.org/abs/2508.14444)
-- **Mamba-2**: [Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality](https://arxiv.org/abs/2405.21060)
-
-## Related Documentation
-
-- Recipe usage and customization: [Recipe usage](../../recipe-usage.md)
-- Training configuration: [Configuration overview](../../training/config-container-overview.md)
-- Training entry points: [Entry points](../../training/entry-points.md)
-- PEFT methods (LoRA, DoRA): [PEFT Guide](../../training/peft.md)
-
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/megatron-fsdp.md
-```md
-# Megatron FSDP
-
-Megatron FSDP is the practical fully sharded data parallel path in Megatron
-Bridge today. It shards parameters, gradients, and optimizer state across data
-parallel ranks, which can reduce model-state memory substantially compared with
-plain Distributed Data Parallel (DDP) or the distributed optimizer path.
-
-This page is the stable overview for what Megatron FSDP is, when to use it, and
-what constraints matter. For operational enablement, code anchors, and
-verification commands, see [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md).
-
-## What It Is
-
-Megatron FSDP is the Megatron-Core custom FSDP implementation exposed in Bridge
-through `use_megatron_fsdp`.
-
-Compared with other data-parallel strategies:
-
-| Feature | DDP | Distributed Optimizer | Megatron FSDP |
-|---|---|---|---|
-| Parameter Storage | Replicated | Replicated | Sharded |
-| Optimizer States | Replicated | Sharded | Sharded |
-| Gradient Communication | All-reduce | Reduce-scatter | Reduce-scatter |
-| Parameter Communication | None | All-gather (after update) | All-gather (on-demand) |
-| Memory Efficiency | Baseline | High | Highest |
-| Communication Overhead | Low | Medium | Medium-High |
-
-The practical consequence is that Megatron FSDP is most useful when model-state
-memory, rather than activation memory, is the main bottleneck.
-
-## When to Use It
-
-Megatron FSDP is a good fit when all of the following are true:
-
-- the model is too large for plain DDP or distributed optimizer
-- you want the strongest currently supported FSDP path in Bridge
-- you are willing to trade more communication for lower memory
-- you can adopt the required FSDP checkpoint format
-
-Prefer another path when:
-
-- DDP already fits comfortably and simplicity matters most
-- distributed optimizer gives enough memory relief without fully sharding
-- you are evaluating PyTorch FSDP2 for production use on this branch
-
-## Stable Requirements
-
-Megatron FSDP in Bridge requires:
-
-- `use_megatron_fsdp` to be enabled
-- checkpoint format `fsdp_dtensor`
-- standard rank initialization order
-
-The `fsdp_dtensor` format uses PyTorch DTensor and
-`torch.distributed.checkpoint` (DCP) to store sharded parameters and optimizer
-state. It is **not interchangeable** with `torch_dist` or `zarr` checkpoints —
-you cannot load an `fsdp_dtensor` checkpoint into a non-FSDP run or vice versa.
-
-`fsdp_dtensor` is compatible with 5D parallelism (TP + PP + DP + CP + EP).
-Because DCP stores DTensor placement metadata, checkpoints saved under one
-parallelism layout can be loaded under a different layout (e.g., change TP or PP
-size between runs) — DCP handles the shard remapping automatically. The one
-unsupported combination is `use_tp_pp_dp_mapping=True`, which uses an
-alternative rank-initialization order that conflicts with FSDP sharding.
-
-Important stable constraints:
-
-- `use_megatron_fsdp` and `use_torch_fsdp2` are mutually exclusive
-- `use_tp_pp_dp_mapping` is not supported with Megatron FSDP
-- legacy checkpoint formats such as `torch_dist` and `zarr` are not valid for
-  Megatron FSDP save/load
-
-When Megatron FSDP is enabled, Bridge also adjusts some settings
-automatically, including disabling `average_in_collective` and several
-buffer-reuse optimizations that do not match the FSDP path.
-
-## Compatibility and Caveats
-
-At the configuration level, Megatron FSDP is intended to work with:
-
-- tensor parallelism
-- pipeline parallelism
-- context parallelism
-- expert parallelism
-- BF16 or FP16 mixed precision
-
-However, not every combination has the same level of in-repo validation or
-performance evidence. Treat broad compatibility as code-supported first, not as
-fully benchmark-proven for every combination.
-
-Two practical caveats matter most:
-
-1. Public recipes may expose `use_megatron_fsdp` while still defaulting to a
-   non-FSDP checkpoint format. The checkpoint requirement is stable and
-   mandatory even when recipe ergonomics lag behind.
-2. FSDP reduces model-state memory, not activation memory. For long-sequence or
-   activation-bound workloads, other techniques such as context parallelism,
-   activation recomputation, or CPU offloading may still be needed.
-
-## Torch FSDP2 Status
-
-Megatron Bridge also exposes a PyTorch FSDP2 path via `use_torch_fsdp2`, but
-that path should still be treated as experimental on this branch.
-
-The stable recommendation today is:
-
-- use Megatron FSDP if you need an FSDP path in Bridge
-- do not treat FSDP2 as interchangeable with Megatron FSDP
-
-## Related Docs
-
-- [docs/training/checkpointing.md](checkpointing.md)
-- [docs/training/cpu-offloading.md](cpu-offloading.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [skills/perf-techniques/megatron-fsdp/SKILL.md](../skills/perf-techniques/megatron-fsdp/SKILL.md)
-- [skills/perf-techniques/megatron-fsdp/card.yaml](../skills/perf-techniques/megatron-fsdp/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/attention-optimizations.md
-```md
-# Attention Optimizations
-
-Megatron Bridge provides several attention optimizations to improve the efficiency and performance of transformer models. These optimizations include Flash Attention for memory efficiency, and Multi-Query Attention (MQA) and Grouped-Query Attention (GQA) for computational efficiency.
-
-## Flash Attention
-
-### Overview
-
-Flash attention is an algorithm designed to improve the efficiency of the attention mechanism in transformer models such as GPT and BERT. The attention mechanism has quadratic time and memory complexity in sequence length and can present significant runtime and memory challenges for longer sequences.
-
-Compared to the standard, non-flash algorithm, flash attention applies two techniques to lower the memory requirement and improve compute efficiency:
-
-1. **Tiling technique**: Decomposes the inputs based on the shared memory size and calculates the softmax one tile at a time. Instead of working on the entire query, key, and value tensors at once, it makes several passes at these tensors and then combines the results in a subsequent step.
-
-2. **Recomputation technique**: Stores the softmax normalization factors (linear in sequence length) instead of the softmax results (quadratic in sequence length), and uses these normalization factors to recompute the attention scores. This reduces both the amount of data written to global memory and the I/O traffic between global memory and shared memory.
-
-Flash attention lowers the memory footprint from quadratic to linear in sequence length and reduces memory traffic, while the amount of computation remains quadratic. This greatly extends the range of sequence lengths allowed in large language models.
-
-### Configure Flash Attention
-
-In Megatron Bridge, flash attention is configured through the `attention_backend` parameter in your model configuration. The framework supports multiple attention backends through Transformer Engine integration:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.core.transformer.enums import AttnBackend
-
-# Configure model with flash attention (default)
-model_config = GPTModelProvider(
-    attention_backend=AttnBackend.auto,  # Let TE choose the best backend (default)
-    # ... other model parameters
-)
-
-# Or explicitly specify flash attention
-model_config = GPTModelProvider(
-    attention_backend=AttnBackend.flash_attn,  # Explicitly use flash attention
-    # ... other model parameters
-)
-```
-
-### Attention Backend Options
-
-Megatron Bridge supports several attention backends through the `attention_backend` configuration:
-
-- `AttnBackend.auto`: Automatically selects the best available backend (recommended)
-- `AttnBackend.flash_attn`: Explicitly use Flash Attention implementation
-- `AttnBackend.fused_attn`: Use cuDNN fused attention (when available)
-- `AttnBackend.local`: Use local PyTorch implementation (for debugging)
-
-### Environment Variable Control
-
-For fine-grained control, you can still use environment variables to disable specific implementations:
-
-```bash
-# Disable flash attention
-export NVTE_FLASH_ATTN=0
-
-# Disable cuDNN fused attention
-export NVTE_FUSED_ATTN=0
-```
-
-However, the recommended approach is to use the `attention_backend` configuration parameter.
-
-## Multi-query Attention (MQA) and Grouped-query Attention (GQA)
-
-**Multi-query Attention (MQA)** and **Grouped-query Attention (GQA)** are modifications of the traditional multihead attention mechanism in Transformer models. These methods improve the efficiency and effectiveness of attention mechanisms.
-
-### Overview
-
-**Multi-query Attention (MQA)**
-
-MQA treats all attention heads as a single group, reducing computational complexity and accelerating training times. It is beneficial when model scalability or limited computational resources are concerns.
-
-**Grouped-query Attention (GQA)**
-
-GQA groups the heads into clusters, each processing a subset of queries independently. This method balances the detailed focus of traditional multihead attention with the broad approach of MQA, enhancing nuanced input data processing.
-
-These attention variants offer:
-
-- **Reduced computational load**: Both methods decrease computation, beneficial for large models
-- **Increased processing speed**: Simplifying attention leads to faster training and inference
-- **Flexibility and adaptability**: Adjustments can be made based on task needs or hardware constraints
-
-### Enable MQA and GQA
-
-To use MQA or GQA in Megatron Bridge, adjust the `num_query_groups` parameter in your model configuration:
-
-#### Multi-query Attention (MQA)
-Set `num_query_groups` to 1 to treat all attention heads as a single group:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-model_config = GPTModelProvider(
-    num_attention_heads=32,
-    num_query_groups=1,  # Enables Multi-query Attention
-    # ... other model parameters
-)
-```
-
-#### Grouped-query Attention (GQA)
-Set `num_query_groups` to a number that is a divisor of the total number of attention heads (more than one but less than the total heads):
-
-```python
-model_config = GPTModelProvider(
-    num_attention_heads=32,
-    num_query_groups=8,  # Enables Grouped-query Attention (4 heads per group)
-    # ... other model parameters
-)
-```
-
-#### Regular Multihead Attention
-For regular attention, set this parameter to `None` or match it with the number of heads:
-
-```python
-model_config = GPTModelProvider(
-    num_attention_heads=32,
-    num_query_groups=None,  # Default setting for regular multihead attention
-    # Or equivalently:
-    # num_query_groups=32,  # One group per head
-    # ... other model parameters
-)
-```
-
-## Resources
-
-- [Megatron Core Attention Implementation](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/attention.py)
-- [Flash Attention Paper](https://arxiv.org/abs/2205.14135)
-- [Transformer Engine Attention Mechanisms](https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/attention/attention.html)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md
-```md
-# Parallelisms Guide
-
-Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency.
-
-## Data Parallelism
-
-Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps.
-
-### Distributed Data Parallelism
-
-Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives.
-
-![Distributed Data Parallelism](images/ddp.gif)
-*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.*
-
-### Distributed Optimizer
-
-[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs its own gradient shard, the distributed optimizer performs a reduce-scatter of the parameter gradients instead of an all-reduce. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory requirements of large-scale LLM training.
-
-### Enable Data Parallelism
-
-In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group.
-
-To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig`:
-
-```python
-from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig
-
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    lr=3e-4,
-    weight_decay=0.1,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    use_distributed_optimizer=True,
-    clip_grad=1.0,
-)
-ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True)
-
-config = ConfigContainer(
-    ddp=ddp_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation.
-
-## Model Parallelism
-
-Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce the need for per-GPU memory. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance.
-
-### Tensor Parallelism
-
-Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads.
-
-![Tensor Parallelism Overview](images/tp1.png)
-*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.*
-
-![Tensor Parallelism Implementation](images/tp2.png)
-*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.*
-
-#### Enable Tensor Parallelism
-
-To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. This parameter determines the number of GPUs among which the model's tensors are partitioned.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with tensor parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Enable TP across 2 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Implement Tensor Parallelism
-
-Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html).
-
-### Pipeline Parallelism
-
-Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially.
-
-![Pipeline Parallelism](images/pp.gif)
-*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.*
-
-#### Enable Pipeline Parallelism
-
-To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with pipeline parallelism
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,  # Distribute layers across 4 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Interleaved Pipeline Parallel Schedule
-
-To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`:
-
-```python
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=2,  # 2 model chunks per pipeline stage
-    # ... other model parameters
-)
-```
-
-For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism).
-
-#### Implement Pipeline Parallelism
-
-The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html).
-
-### Expert Parallelism and Mixture of Experts (MoE)
-
-Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers.
-
-MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input.
-
-![Expert Parallelism](images/ep.png)
-*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.*
-
-#### Basic MoE Configuration
-
-To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure basic MoE model
-model_config = GPTModelProvider(
-    num_moe_experts=8,           # Number of experts in the MoE module
-    moe_router_topk=2,           # Number of experts activated per token
-    moe_ffn_hidden_size=8192,    # Hidden size for expert FFN layers
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Parallelism
-
-To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size.
-
-```python
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,  # Distribute 8 experts across 4 GPUs (2 experts per GPU)
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Tensor Parallelism
-
-To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration:
-
-```python
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    expert_tensor_parallel_size=2,  # Apply tensor parallelism within each expert
-    # ... other model parameters
-)
-```
-
-#### Advanced MoE Features
-
-Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures.
-
-##### DeepEP and HybridEP Optimizations
-
-DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures:
-
-- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs
-- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs
-
-These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads.
-
-**Enable DeepEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-```
-
-**Enable HybridEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply HybridEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep")
-```
-
-**GPU Architecture Requirements:**
-
-- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-
-The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware.
-
-##### Token Dropping for Load Balancing
-
-Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    moe_router_topk=2,
-    moe_token_dispatcher_type="alltoall",  # Required for token dropping
-    moe_router_load_balancing_type="aux_loss",  # Required load balancing type
-    # ... other model parameters
-)
-
-# Apply token dropping with capacity factor
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,  # Capacity multiplier per expert
-    moe_pad_expert_input_to_capacity=True,  # Pad inputs to capacity length
-)
-```
-
-**Configuration Parameters:**
-
-- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing.
-- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes.
-
-**Requirements:**
-
-- Token dispatcher must be `alltoall` or `alltoall_seq`
-- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none`
-
-**Trade-offs:**
-
-Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed.
-
-#### Complete MoE Configuration Example
-
-Here's a complete example showing how to configure an MoE model with advanced optimizations:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_layers=32,
-    hidden_size=4096,
-    num_attention_heads=32,
-    
-    # MoE configuration
-    num_moe_experts=8,                    # 8 experts total
-    moe_router_topk=2,                    # Activate 2 experts per token
-    moe_ffn_hidden_size=8192,            # Expert FFN hidden dimension
-    moe_token_dispatcher_type="alltoall", # Token dispatcher type
-    moe_router_load_balancing_type="aux_loss",  # Load balancing
-    
-    # Expert parallelism
-    expert_model_parallel_size=4,         # Distribute experts across 4 GPUs
-    expert_tensor_parallel_size=2,        # Apply TP within each expert
-    
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization (for Ampere/Hopper GPUs)
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-
-# Apply token dropping for load balancing
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,
-    moe_pad_expert_input_to_capacity=True,
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Expert Parallelism Implementation
-
-The Megatron Bridge implementation of EP uses functionality from Megatron Core. Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details.
-
-## Activation Partitioning
-
-In LLM training, a large memory space is needed to store the input activations of the network layers. Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes.
-
-### Sequence Parallelism
-
-Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency.
-
-![Sequence Parallelism](images/sp.png)
-*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.*
-
-#### Enable Sequence Parallelism
-
-To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with sequence parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Required for sequence parallelism
-    sequence_parallel=True,        # Enable sequence parallelism
-    # ... other model parameters
-)
-```
-
-#### Implement Sequence Parallelism
-
-The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py).
-
-### Context Parallelism
-
-Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. Unlike Sequence Parallelism (SP), which partitions the activations of only specific layers, CP divides the activations of all layers.
-
-CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences.
-
-#### Enable Context Parallelism
-
-To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with context parallelism
-model_config = GPTModelProvider(
-    context_parallel_size=2,  # Distribute sequence across 2 GPUs
-    # ... other model parameters
-)
-```
-
-For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs.
-
-#### Implement Context Parallelism
-
-Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using all-gather and reduce-scatter communication schemes, which are transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency.
-
-For more detailed technical information and implementation details, visit:
-- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html)
-- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py)
-- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py)
-
-## Combined Parallelism Example
-
-Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer, OptimizerConfig
-
-# Configure model with multiple parallelism strategies
-model_config = GPTModelProvider(
-    # Model parallelism
-    tensor_model_parallel_size=2,      # 2-way tensor parallelism
-    pipeline_model_parallel_size=4,    # 4-way pipeline parallelism
-    virtual_pipeline_model_parallel_size=2,  # Interleaved pipeline
-    
-    # Activation partitioning
-    sequence_parallel=True,            # Enable sequence parallelism (requires TP > 1)
-    context_parallel_size=2,           # 2-way context parallelism
-    
-    # Expert parallelism (for MoE models)
-    num_moe_experts=8,                 # 8 experts
-    expert_model_parallel_size=4,      # Distribute experts across 4 GPUs
-    
-    # ... other model parameters
-)
-
-# Configure distributed optimizer
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    use_distributed_optimizer=True,    # Enable distributed optimizer
-    # ... other optimizer parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-## Data Parallel Size Calculation
-
-The data parallel size is automatically calculated based on the total world size and model parallelism settings:
-
-```
-data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size)
-```
-
-For example, with 32 GPUs total and the configuration above:
-- `tensor_model_parallel_size = 2`
-- `pipeline_model_parallel_size = 4` 
-- `context_parallel_size = 2`
-- `data_parallel_size = 32 / (2 × 4 × 2) = 2`
-
-## Strategy Selection Guide
-
-Choosing the right combination depends on model size, hardware topology,
-and sequence length.
-
-### Dense Models by Size
-
-| Model size | GPUs | Recommended starting point |
-|---|---|---|
-| < 1B | 1-8 | DP only |
-| 1-10B | 8-16 | TP=2-4 + DP |
-| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP |
-| 70-175B | 64-256 | TP=8 + PP=4-8 + DP |
-| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP |
-
-### MoE Models
-
-MoE models differ fundamentally from dense models: only a fraction of
-parameters are active per token, so TP can often stay at 1 or 2. EP is
-the primary scaling dimension.
-
-| Total / active params | Typical layout |
-|---|---|
-| < 20B | EP only (TP=1, PP=1) |
-| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 |
-| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 |
-| 500B+ | TP=2 + PP=16 + EP=32-64 |
-
-### By Hardware Topology
-
-- **Single node with NVLink**: maximize TP within the node (up to TP=8).
-- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes.
-- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling.
-
-### By Sequence Length
-
-| Sequence length | Recommendation |
-|---|---|
-| < 2K | standard TP + PP + DP |
-| 2K-8K | add SP (`sequence_parallel=True`) |
-| 8K-32K | add CP=2 |
-| 32K+ | add CP=4-8, consider hierarchical CP |
-
-For operational details on configuring combined parallelism, troubleshooting
-layouts, and memory estimation, see the
-[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md).
-
-## Configuration Guidelines
-
-### Memory Optimization
-- Use **distributed optimizer** to reduce optimizer state memory
-- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory
-- Use **context parallelism** for long sequence training
-- Consider **pipeline parallelism** for very large models that don't fit on a single GPU
-
-### Performance Optimization
-- **Tensor parallelism** works best within a single node (high bandwidth)
-- **Pipeline parallelism** can work across nodes but requires careful batch size tuning
-- **Context parallelism** is essential for long context scenarios
-- **Expert parallelism** is specific to MoE models; the expert parallel size should evenly divide the number of experts (for example, 8 experts with `expert_model_parallel_size=4`)
-- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures
-
-### Compatibility
-- **Sequence parallelism** requires `tensor_model_parallel_size > 1`
-- **Expert parallelism** requires MoE models (`num_moe_experts > 0`)
-- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs
-- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs
-- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher
-- All parallelism strategies can be combined, but the product of the model-parallel sizes (for example, TP × PP × CP) must evenly divide the world size
-
-## Related Artifacts
-
-- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification
-- **Knowledge card**: [skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status
-
-## Resources
-
-- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/)
-- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/)
-- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM)
-- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/README.md
-```md
-# Nemotron 3 Examples
-
-This directory contains example scripts for Nemotron 3 language models:
-
-| Model | Parameters | Active Parameters | Subdirectory |
-|-------|-----------|-------------------|--------------|
-| Nemotron 3 Nano | 30B | A3B | [nano/](nano/) |
-| Nemotron 3 Super | 120B | A12B | [super/](super/) |
-
-## Workspace Configuration
-
-All scripts use a `WORKSPACE` environment variable to define the base directory for checkpoints and results. By default, this is set to `/workspace`. You can override it:
-
-```bash
-export WORKSPACE=/your/custom/path
-```
-
-Directory structure:
-- `${WORKSPACE}/models/` - Converted checkpoints
-- `${WORKSPACE}/results/` - Training outputs and experiment results
-
-## Checkpoint Conversion
-
-Each model has its own conversion script: [nano/conversion.sh](nano/conversion.sh), [super/conversion.sh](super/conversion.sh).
-
-## Training Recipes
-
-Available recipes:
-
-**Nano** ([source](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py)):
-- `nemotron_3_nano_pretrain_config`: Pretraining
-- `nemotron_3_nano_sft_config`: Supervised fine-tuning
-- `nemotron_3_nano_peft_config`: PEFT with LoRA support
-
-**Super** ([source](../../../src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py)):
-- `nemotron_3_super_pretrain_config`: Pretraining
-- `nemotron_3_super_sft_config`: Supervised fine-tuning
-- `nemotron_3_super_peft_config`: PEFT with LoRA support
-
-Before training, ensure the following are configured:
-1. **Container Image**: Set `CONTAINER_IMAGE` in the SLURM scripts to your container path
-2. **Container Mounts**: (optional) Set `CONTAINER_MOUNTS` for data and workspace directories
-3. **Environment Variables**:
-   - `HF_TOKEN`: to download models from HF Hub (if required)
-   - `HF_HOME`: (optional) to avoid re-downloading models and datasets
-   - `WANDB_API_KEY`: (optional) to enable WandB logging
-
-All training scripts use SLURM for containerized multi-node training.
-
-### Nano
-
-See the SLURM scripts in [nano/](nano/): [slurm_pretrain.sh](nano/slurm_pretrain.sh), [slurm_sft.sh](nano/slurm_sft.sh), [slurm_peft.sh](nano/slurm_peft.sh).
-
-### Super
-
-See the SLURM scripts in [super/](super/): [slurm_pretrain.sh](super/slurm_pretrain.sh), [slurm_sft.sh](super/slurm_sft.sh), [slurm_peft.sh](super/slurm_peft.sh).
-
-## Evaluation
-
-Coming soon.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-
-from megatron.bridge import AutoBridge
-from megatron.bridge.peft.base import PEFT
-from megatron.bridge.peft.lora import LoRA
-from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common
-from megatron.bridge.recipes.utils.finetune_utils import default_peft_config
-from megatron.bridge.training.config import ConfigContainer
-
-
-# Hugging Face model ID passed to AutoBridge.from_hf_pretrained() by every
-# recipe in this module to derive the model provider.
-NEMOTRON_3_SUPER_HF_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16"
-
-
-def nemotron_3_super_pretrain_config() -> ConfigContainer:
-    """Return a pre-training config for Nemotron 3 Super (120B-A12B LatentMoE).
-
-    This is a Latent MoE model with Multi-Token Prediction (MTP). Default parallelism:
-    - TP=4, PP=1, EP=8, SP=True
-
-    The config starts from ``_pretrain_common()``; the model provider is then
-    replaced with one derived from the Hugging Face config of
-    ``NEMOTRON_3_SUPER_HF_MODEL_ID`` and the fields below are overridden.
-    Other defaults set here: sequence length 8192, global batch size 3072,
-    micro batch size 1, 39735 training iterations, TE-scoped CUDA graphs.
-
-    Returns:
-        ConfigContainer: Pre-training configuration for Nemotron 3 Super.
-    """
-    # Baseline returned by _pretrain_common(); every assignment below overrides it.
-    cfg = _pretrain_common()
-
-    # Model Configuration (LatentMoE with MTP) — derived from HF config via AutoBridge
-    # load_weights=False — presumably builds the provider from the HF config
-    # without loading checkpoint weights; confirm against AutoBridge.
-    cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False)
-
-    # Parallelism Settings
-    cfg.model.tensor_model_parallel_size = 4
-    cfg.model.pipeline_model_parallel_size = 1
-    cfg.model.pipeline_dtype = torch.bfloat16
-    cfg.model.virtual_pipeline_model_parallel_size = None
-    cfg.model.context_parallel_size = 1
-    cfg.model.sequence_parallel = True
-    cfg.model.expert_tensor_parallel_size = 1
-    cfg.model.expert_model_parallel_size = 8
-    cfg.model.pipeline_model_parallel_layout = None
-    cfg.model.seq_length = 8192
-
-    # Tokenizer (--tokenizer-model)
-    # NOTE(review): the tokenizer is taken from the Nano repo rather than
-    # NEMOTRON_3_SUPER_HF_MODEL_ID — presumably the two models share a
-    # tokenizer; confirm.
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Dataset Configuration
-    # seq_length matches cfg.model.seq_length set above.
-    cfg.dataset.seq_length = 8192
-    cfg.dataset.blend = None
-    cfg.dataset.num_workers = 1
-    cfg.dataset.mmap_bin_files = False
-
-    # MoE Token Dispatcher Settings
-    cfg.model.moe_token_dispatcher_type = "alltoall"
-    cfg.model.moe_shared_expert_overlap = False
-    cfg.model.moe_flex_dispatcher_backend = "hybridep"
-
-    # Training Configuration
-    cfg.train.train_iters = 39735
-    cfg.train.global_batch_size = 3072
-    cfg.train.micro_batch_size = 1
-    cfg.train.manual_gc = False
-    cfg.train.manual_gc_interval = 0
-
-    # Validation
-    cfg.validation.eval_interval = 1000
-
-    # Transformer Engine (TE)
-    cfg.model.transformer_impl = "transformer_engine"
-
-    # CUDA Graph (TE impl + partial scopes: ~40% throughput gain over disabled)
-    # The scope list includes "moe_router" alongside "moe_preprocess" and does
-    # not include the full "moe" scope.
-    cfg.model.cuda_graph_impl = "transformer_engine"
-    cfg.model.cuda_graph_scope = ["attn", "mamba", "moe_router", "moe_preprocess"]
-    cfg.model.cuda_graph_warmup_steps = 3
-
-    # Kernel Selections
-    cfg.model.attention_backend = "fused"
-    cfg.model.cross_entropy_fusion_impl = "te"
-    # NOTE(review): only the model-side TE RNG tracker flag is set here;
-    # cfg.rng.te_rng_tracker is not touched in this function — confirm it is
-    # enabled by _pretrain_common() or elsewhere when CUDA graphs are on.
-    cfg.model.use_te_rng_tracker = True
-
-    # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block;
-    # mtp_num_layers=2 controls forward-pass repetitions with mtp_use_repeated_layer)
-    cfg.model.mtp_num_layers = 2
-    cfg.model.keep_mtp_spec_in_bf16 = True
-    cfg.model.calculate_per_token_loss = True
-    cfg.model.mtp_loss_scaling_factor = 0.3
-    cfg.model.mtp_use_repeated_layer = True
-
-    # Mixed Precision
-    cfg.mixed_precision = "nemotron_3_super_bf16_with_nvfp4_mixed"
-
-    # Optimizer hyperparameters
-    # min_lr is 1% of the peak lr; weight decay is held constant at 0.1
-    # (start_weight_decay == end_weight_decay).
-    cfg.optimizer.lr = 4.5e-4
-    cfg.optimizer.min_lr = 4.5e-6
-    cfg.optimizer.weight_decay = 0.1
-    cfg.optimizer.adam_beta1 = 0.9
-    cfg.optimizer.adam_beta2 = 0.95
-    cfg.optimizer.adam_eps = 1e-8
-    cfg.scheduler.lr_warmup_iters = 333
-    cfg.scheduler.start_weight_decay = 0.1
-    cfg.scheduler.end_weight_decay = 0.1
-    cfg.scheduler.lr_decay_style = "WSD"
-
-    # Checkpoint Configuration
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-    cfg.checkpoint.async_save = True
-
-    # DDP Configuration
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.use_distributed_optimizer = True
-    cfg.ddp.average_in_collective = False
-
-    # Remaining model overrides: init std and fused-kernel toggles
-    cfg.model.init_method_std = 0.014
-    cfg.model.apply_rope_fusion = False
-    cfg.model.gradient_accumulation_fusion = True
-    cfg.model.use_fused_weighted_squared_relu = True
-
-    return cfg
-
-
-# =============================================================================
-# SFT Config
-# =============================================================================
-
-
-def nemotron_3_super_sft_config() -> ConfigContainer:
-    """Return a full SFT config for Nemotron 3 Super (120B-A12B LatentMoE).
-
-    Default parallelism: TP=1, PP=1, EP=8, SP=True
-
-    The config starts from ``_sft_common()``; the model provider is then
-    replaced with one derived from the Hugging Face config of
-    ``NEMOTRON_3_SUPER_HF_MODEL_ID`` and the fields below are overridden.
-    Other defaults set here: sequence length 2048, learning rate 5e-6 with
-    cosine decay, CUDA graphs disabled, RNG seed 1234.
-
-    Returns:
-        ConfigContainer with all settings pre-configured for Nemotron 3 Super SFT.
-    """
-    # Baseline returned by _sft_common(); every assignment below overrides it.
-    cfg = _sft_common()
-
-    # Model config — derived from HF config via AutoBridge
-    # load_weights=False — presumably builds the provider from the HF config
-    # without loading checkpoint weights; confirm against AutoBridge.
-    cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False)
-
-    # Parallelism settings
-    # NOTE(review): sequence_parallel=True is set together with
-    # tensor_model_parallel_size=1. The Megatron Bridge parallelism guide says
-    # sequence parallelism only takes effect when TP > 1 — confirm this is
-    # intentional (e.g. so SP applies once a caller raises TP).
-    cfg.model.tensor_model_parallel_size = 1
-    cfg.model.pipeline_model_parallel_size = 1
-    cfg.model.pipeline_dtype = torch.bfloat16
-    cfg.model.virtual_pipeline_model_parallel_size = None
-    cfg.model.context_parallel_size = 1
-    cfg.model.sequence_parallel = True
-    cfg.model.expert_tensor_parallel_size = 1
-    cfg.model.expert_model_parallel_size = 8
-    cfg.model.pipeline_model_parallel_layout = None
-    cfg.model.seq_length = 2048
-
-    # Training-specific model overrides
-    cfg.model.apply_rope_fusion = False
-    cfg.model.attention_backend = "fused"
-    cfg.model.gradient_accumulation_fusion = True
-    cfg.model.init_method_std = 0.014
-    cfg.model.use_fused_weighted_squared_relu = True
-    cfg.model.calculate_per_token_loss = True
-
-    # MoE Token Dispatcher Settings
-    cfg.model.moe_token_dispatcher_type = "alltoall"
-    cfg.model.moe_shared_expert_overlap = False
-    cfg.model.moe_flex_dispatcher_backend = "hybridep"
-
-    # CUDA Graph disabled — packed-sequence SFT passes explicit attention masks that
-    # are incompatible with CUDA graph capture/replay in Mamba layers.
-    cfg.model.cuda_graph_impl = "none"
-    cfg.model.cuda_graph_scope = []
-
-    # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block;
-    # mtp_num_layers=2 controls forward-pass repetitions with mtp_use_repeated_layer)
-    cfg.model.mtp_num_layers = 2
-    cfg.model.keep_mtp_spec_in_bf16 = True
-    cfg.model.mtp_loss_scaling_factor = 0.3
-    cfg.model.mtp_use_repeated_layer = True
-    cfg.model.use_te_rng_tracker = True
-
-    # Optimizer overrides
-    # Weight decay is held constant at 0.1 (start_weight_decay == end_weight_decay).
-    cfg.optimizer.lr = 5e-6
-    cfg.optimizer.adam_beta1 = 0.9
-    cfg.optimizer.adam_beta2 = 0.95
-    cfg.optimizer.adam_eps = 1e-8
-    cfg.optimizer.weight_decay = 0.1
-    cfg.scheduler.start_weight_decay = 0.1
-    cfg.scheduler.end_weight_decay = 0.1
-    cfg.scheduler.lr_decay_style = "cosine"
-
-    # Tokenizer
-    # NOTE(review): the tokenizer is taken from the Nano repo rather than
-    # NEMOTRON_3_SUPER_HF_MODEL_ID — presumably the two models share a
-    # tokenizer; confirm.
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Checkpoint config overrides
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    cfg.checkpoint.async_save = True
-
-    # Logger config
-    cfg.logger.log_interval = 10
-
-    # RNG config
-    cfg.rng.seed = 1234
-
-    # DDP config
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.grad_reduce_in_fp32 = True
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.use_distributed_optimizer = True
-
-    return cfg
-
-
-# =============================================================================
-# PEFT Config
-# =============================================================================
-
-
-def nemotron_3_super_peft_config(
-    peft_scheme: str | PEFT = "lora",
-) -> ConfigContainer:
-    """Return a PEFT config for Nemotron 3 Super (120B-A12B LatentMoE).
-
-    Default parallelism: TP=1, PP=1, EP=1, SP=True
-
-    The config starts from ``_peft_common()``; the model provider is then
-    replaced with one derived from the Hugging Face config of
-    ``NEMOTRON_3_SUPER_HF_MODEL_ID`` and the fields below are overridden.
-    Other defaults set here: sequence length 2048, learning rate 1e-4 with
-    cosine decay, CUDA graphs disabled, RNG seed 1234.
-
-    Args:
-        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.
-            String values are matched case-insensitively. A ``PEFT`` instance
-            is used as-is. Any other value (for example an unrecognised
-            string) does not raise; it falls back to a LoRA adapter with
-            dim=32 and alpha=32 on the Mamba-specific target modules.
-
-    Returns:
-        ConfigContainer with all settings pre-configured for Nemotron 3 Super PEFT.
-    """
-    # Baseline returned by _peft_common(); every assignment below overrides it.
-    cfg = _peft_common()
-
-    # Model config — derived from HF config via AutoBridge
-    # load_weights=False — presumably builds the provider from the HF config
-    # without loading checkpoint weights; confirm against AutoBridge.
-    cfg.model = AutoBridge.from_hf_pretrained(NEMOTRON_3_SUPER_HF_MODEL_ID).to_megatron_provider(load_weights=False)
-
-    # Parallelism settings
-    # NOTE(review): sequence_parallel=True is set together with
-    # tensor_model_parallel_size=1. The Megatron Bridge parallelism guide says
-    # sequence parallelism only takes effect when TP > 1 — confirm this is
-    # intentional.
-    cfg.model.tensor_model_parallel_size = 1
-    cfg.model.pipeline_model_parallel_size = 1
-    cfg.model.pipeline_dtype = torch.bfloat16
-    cfg.model.virtual_pipeline_model_parallel_size = None
-    cfg.model.context_parallel_size = 1
-    cfg.model.sequence_parallel = True
-    cfg.model.expert_tensor_parallel_size = 1
-    cfg.model.expert_model_parallel_size = 1
-    cfg.model.pipeline_model_parallel_layout = None
-    cfg.model.seq_length = 2048
-
-    # Training-specific model overrides
-    cfg.model.apply_rope_fusion = False
-    cfg.model.attention_backend = "fused"
-    cfg.model.gradient_accumulation_fusion = True
-    cfg.model.init_method_std = 0.014
-    cfg.model.use_fused_weighted_squared_relu = True
-    cfg.model.calculate_per_token_loss = True
-
-    # MoE Token Dispatcher Settings
-    cfg.model.moe_token_dispatcher_type = "alltoall"
-    cfg.model.moe_shared_expert_overlap = False
-    cfg.model.moe_flex_dispatcher_backend = "hybridep"
-
-    # CUDA Graph disabled — packed-sequence SFT passes explicit attention masks that
-    # are incompatible with CUDA graph capture/replay in Mamba layers.
-    cfg.model.cuda_graph_impl = "none"
-    cfg.model.cuda_graph_scope = []
-
-    # MTP Settings (HF config has num_nextn_predict_layers=1 for the shared block;
-    # mtp_num_layers=2 controls forward-pass repetitions with mtp_use_repeated_layer)
-    cfg.model.mtp_num_layers = 2
-    cfg.model.keep_mtp_spec_in_bf16 = True
-    cfg.model.mtp_loss_scaling_factor = 0.3
-    cfg.model.mtp_use_repeated_layer = True
-    cfg.model.use_te_rng_tracker = True
-
-    # PEFT config - Nemotron uses Mamba-specific target modules
-    mamba_target_modules = ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]
-    if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]:
-        # Known scheme name (case-insensitive): build it via the shared helper.
-        cfg.peft = default_peft_config(peft_scheme, target_modules=mamba_target_modules)
-    elif isinstance(peft_scheme, PEFT):
-        # Caller-supplied PEFT object is used unchanged (its own target modules apply).
-        cfg.peft = peft_scheme
-    else:
-        # Fallback for anything else, including unrecognised strings: a fixed
-        # LoRA adapter. No error is raised for an unknown scheme name.
-        cfg.peft = LoRA(
-            target_modules=mamba_target_modules,
-            dim=32,
-            alpha=32,
-            dropout=0.0,
-            dropout_position="pre",
-            lora_A_init_method="xavier",
-            lora_B_init_method="zero",
-        )
-
-    # Optimizer overrides
-    # Weight decay is held constant at 0.1 (start_weight_decay == end_weight_decay).
-    cfg.optimizer.lr = 1e-4
-    cfg.optimizer.adam_beta1 = 0.9
-    cfg.optimizer.adam_beta2 = 0.95
-    cfg.optimizer.adam_eps = 1e-8
-    cfg.optimizer.weight_decay = 0.1
-    cfg.scheduler.start_weight_decay = 0.1
-    cfg.scheduler.end_weight_decay = 0.1
-    cfg.scheduler.lr_decay_style = "cosine"
-
-    # Tokenizer
-    # NOTE(review): the tokenizer is taken from the Nano repo rather than
-    # NEMOTRON_3_SUPER_HF_MODEL_ID — presumably the two models share a
-    # tokenizer; confirm.
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Checkpoint config overrides
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    cfg.checkpoint.async_save = True
-
-    # Logger config
-    cfg.logger.log_interval = 10
-
-    # RNG config
-    cfg.rng.seed = 1234
-
-    # DDP config
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.grad_reduce_in_fp32 = True
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.use_distributed_optimizer = True
-
-    return cfg
-
-
-# Public API of this module: the three recipe factory functions defined above.
-__all__ = [
-    "nemotron_3_super_pretrain_config",
-    "nemotron_3_super_sft_config",
-    "nemotron_3_super_peft_config",
-]
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/cuda-graphs.md
-```md
-# CUDA Graphs
-
-CUDA graphs capture a sequence of GPU operations once and replay them with
-minimal host overhead, reducing repeated kernel-launch and driver costs on
-every training step.
-
-This page is the stable guide for what CUDA graphs are, when they help, and
-what tradeoffs to expect. For exact enablement knobs, code anchors, and
-verification commands, see `skills/perf-techniques/cuda-graphs/SKILL.md`.
-
-## What It Is
-
-CUDA graphs record a fixed sequence of GPU work during a capture phase and then
-replay that sequence on later steps. The main benefit is lower host-side
-launch overhead.
-
-Megatron Bridge supports two capture implementations:
-
-| `cuda_graph_impl` | Mechanism | Scope support |
-|---|---|---|
-| `"local"` | MCore `CudaGraphManager` / `FullCudaGraphWrapper` | `full_iteration` |
-| `"transformer_engine"` | TE `make_graphed_callables()` per layer | `attn`, `mlp`, `moe`, `moe_router`, `moe_preprocess`, `mamba` |
-| `"none"` (default) | Disabled | — |
-
-`"local"` captures the whole forward-backward iteration. `"transformer_engine"`
-captures selected submodules and is usually the more flexible default path.
-
-## What Problem It Solves
-
-CUDA graphs mainly solve launch-bound training steps where GPU compute is fast
-enough that repeated host-driver submission overhead becomes noticeable.
-
-This is most useful when:
-
-- tensor shapes are static across steps
-- the workload has high step frequency or relatively small kernels
-- the run has enough memory headroom to keep graph buffers resident
-
-It is less about changing the math and more about reducing runtime overhead.
-
-## Impacted Training Dimensions
-
-| Dimension | Effect | Confidence | Why |
-|---|---|---|---|
-| `speed` | ~15-30% faster step time | medium | Replays pre-captured GPU work and reduces launch overhead. Measured 16-24% on GPT-OSS-20B and 22% on Qwen3-30B-A3B with TE-scoped graphs. Gain depends on how launch-bound the workload is. |
-| `memory` | ~0-2 GB extra (TE scoped); 10 GB+ possible with `PP > 1` or large MoE | high | Graph buffers stay allocated for replay. TE-scoped showed no measurable increase on 20B/30B models but OOM'd on 120B at 70/79 GB. |
-| `scale` | neutral to slightly positive | low | Can help at scale if launch overhead matters, but memory overhead can gate larger configs (e.g., GPT-OSS-120B OOM). |
-| `convergence` | no change expected | medium | Intended to preserve training math when capture constraints are satisfied. Loss matched within 0.001 on Qwen3-30B-A3B over 20 iterations. |
-| `stability` | adds operational constraints | medium | Requires static shapes, specific RNG/NaN settings, and compatible scope selections. Failure modes are well-defined but add surface area. |
-
-## When to Use It
-
-Enable CUDA graphs when all of the following are mostly true:
-
-- sequence length and micro-batch size are static
-- host overhead is a meaningful part of step time
-- the run has spare memory budget
-- you want throughput improvement without changing the training objective
-
-As a rule of thumb:
-
-- prefer `transformer_engine` scoped graphs for the safer first rollout
-- use `local` `full_iteration` graphs only when you specifically want the
-  largest launch-overhead reduction and can accept the stricter constraints
-
-## When Not to Use It
-
-Avoid CUDA graphs when any of these are true:
-
-- sequence length or batch shapes vary step to step
-- CPU offloading is enabled
-- memory is already tight, especially with `PP > 1`
-- you rely on runtime checks that conflict with `full_iteration` capture
-- you need unsupported scope combinations for MoE or recompute paths
-- SFT/LoRA with packed sequences (`packed_sequence=True`) — TE-scoped graphs
-  cannot capture `packed_seq_params` (non-Tensor input)
-- full activation recompute (`recompute_granularity=full`) with TE-scoped
-  graphs — only `local` full-iteration graphs support full recompute
-
-## Feature Interactions
-
-The most important interactions are:
-
-- `use_te_rng_tracker` and `rng.te_rng_tracker`: required when CUDA graphs are enabled
-- `rerun_state_machine.check_for_nan_in_loss`: must be disabled for `local` + `full_iteration`
-- MoE routing scopes: `moe` and `moe_router` are mutually exclusive
-- `moe_preprocess`: requires `moe_router`
-- `delay_wgrad_compute`: adds extra constraints when captured scopes include attention or MoE router
-- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`: requires `NCCL_GRAPH_REGISTER=0` in the relevant path
-- CPU offloading: incompatible
-
-These interactions are stable enough to treat as design constraints, not just
-debugging tips.
-
-## Bridge Configuration
-
-Minimal high-level configuration:
-
-```python
-cfg.model.cuda_graph_impl = "transformer_engine"   # or "local"
-cfg.model.cuda_graph_scope = ["attn"]              # or other valid scopes
-cfg.model.cuda_graph_warmup_steps = 3
-cfg.model.use_te_rng_tracker = True
-cfg.rng.te_rng_tracker = True
-```
-
-If you use `local` + `full_iteration`, also disable:
-
-```python
-cfg.rerun_state_machine.check_for_nan_in_loss = False
-cfg.ddp.check_for_nan_in_grad = False
-```
-
-## Minimal Runnable Example
-
-For a minimal Bridge-facing example, start from the functional smoke test:
-
-- `tests/functional_tests/recipes/test_llama_recipes_pretrain_cuda_graphs.py`
-
-For a lightweight CLI-driven path, use the performance harness with scoped
-capture and a small model recipe.
-
-## Expected Metric Changes
-
-| Metric | Expected Change | Conditions | Evidence |
-|---|---|---|---|
-| `step_time` | ~15-25% down | Static shapes, MoE, TE scoped (`attn+moe_router+moe_preprocess`) | measured: Qwen3-30B-A3B 623→484ms; GPT-OSS-20B 467-520→391-399ms |
-| `tokens_per_sec` | ~20-33% up | Same as above | measured: Qwen3-30B-A3B 214→274 TFLOP/s/GPU; GPT-OSS-20B 37.9-42.2→49.4-50.4 TFLOP/s/GPU |
-| `peak_memory` | same pre-capture | TE scoped graphs on H100 80GB | measured: no increase in allocated memory on Qwen3-30B-A3B and GPT-OSS-20B |
-| `OOM risk` | up | Tight memory budget or large MoE configs | measured: GPT-OSS-120B blocked at ~70/79 GB before capture |
-
-Do not assume a fixed throughput gain across models. The improvement depends on
-how launch-bound the workload is and how much scope is captured.
-
-## Measured Results (Qwen3-30B-A3B MoE, H100, TP2 PP2 EP4, 2 nodes)
-
-### Pretrain
-
-TE-scoped CUDA graphs (`attn + moe_router + moe_preprocess`) on Qwen3-30B-A3B
-with mock data, GBS=8, MBS=1:
-
-- **~22% faster** iteration time (484ms vs 623ms steady-state)
-- **~28% higher TFLOP/s** (274 vs 214 TFLOP/s/GPU)
-- **Loss matches** baseline within 0.001 across all 20 iterations
-- 24 graphable layers per pipeline rank, capture completes in ~5.6s
-- No memory increase pre-capture, no NCCL errors
-
-### SFT (packed sequences)
-
-SFT with packed sequences (`packed_sequence=True`, SQuAD dataset) hits a
-hard incompatibility:
-
-```
-AssertionError: CUDA graph accepts only Tensor inputs.
-inference_context and packed_seq_params are excluded from input list.
-```
-
-TE-scoped CUDA graphs require all forward inputs to be Tensors. Packed
-sequence SFT passes `packed_seq_params` (a dataclass), which is not captured.
-The baseline SFT runs fine without graphs (~880ms/iter).
-
-Workarounds: disable packing, or use `local` full-iteration graphs. Also make
-sure the TE/container build actually supports the packed-sequence attention
-backend your recipe needs.
-
-## Additional Validation (GPT-OSS, H100, Mar 2026)
-
-### GPT-OSS-20B pretrain
-
-TE-scoped CUDA graphs on `gpt-oss-20b` with `TP2 PP4 EP4 CP1`, 2 nodes, and
-mock data:
-
-- capture succeeds with 6 graphable layers per pipeline rank; capture completes
-  in ~0.95s
-- steady-state iteration time improves by ~16-24% (467-520ms to 391-399ms)
-- throughput improves by ~19-33% (37.9-42.2 to 49.4-50.4 TFLOP/s/GPU)
-- the pre-capture memory report is unchanged and the 20-iteration run completes
-  without NCCL or illegal-memory-access errors
-- loss comparison is inconclusive: the first ~10 post-capture iterations are
-  close, but the run used mock data, `GBS=4`, and a production LR, so later
-  divergence is too noisy to treat as a correctness signal
-
-A cleaner loss-match pass should lower LR and/or raise GBS before drawing
-equivalence conclusions.
-
-### GPT-OSS-20B SFT and LoRA
-
-Both packed-sequence finetuning workloads were blocked in the
-`mbridge-260128.sqsh` container before any CUDA-graph-specific behavior could
-be isolated:
-
-- baseline and graphed runs both fail with no TE attention backend available
-  for the packed-sequence path
-- treat this as an environment/container blocker first, not as proof that CUDA
-  graphs are or are not the root cause
-- after upgrading TE/container support, these workloads still need separate
-  validation because packed-sequence plus TE-scoped graphs remains a sensitive
-  combination
-
-### GPT-OSS-120B pretrain
-
-`gpt-oss-120b` pretrain at `TP2 PP4 EP8`, 4 nodes, hits OOM on iteration 2:
-
-- iteration 1 already uses ~69-70 GB allocated and ~72-73 GB reserved on 79 GB
-  H100s
-- the failure is a `torch.OutOfMemoryError` on an additional 1.54 GiB
-  allocation
-- treat larger MoE rollouts as memory-gated even before capture benefits are
-  realized; more PP or different memory settings may be needed
-
-## Common Failure Modes
-
-- Missing TE RNG tracker settings causes an assertion before training starts.
-- Dynamic sequence or batch shapes break capture or replay assumptions.
-- `local` `full_iteration` graphs fail when NaN-loss checking is still enabled.
-- Illegal scope combinations such as `moe` with `moe_router` fail validation.
-- Runs that fit in eager mode can OOM after enabling graphs because buffers stay pinned.
-- Full activation recompute (`recompute_granularity=full`) with TE-scoped graphs
-  asserts: `full recompute is only supported with full iteration CUDA graph`.
-  Disable recompute or switch to `local` implementation.
-- Packed-sequence SFT/LoRA asserts: `CUDA graph accepts only Tensor inputs.
-  inference_context and packed_seq_params are excluded from input list.`
-  TE-scoped graphs cannot capture non-Tensor forward arguments.
-- Older TE/container builds can fail packed-sequence attention before graph
-  capture begins (`Available backends = {FlashAttention=False,
-  FusedAttention=False, UnfusedDotProductAttention=False}`). In that case the
-  baseline and graph runs are both blocked, so fix the environment first.
-
-## Related Docs
-
-- [Performance Guide](../performance-guide.md)
-- [Communication Overlap](communication-overlap.md)
-- `skills/perf-techniques/cuda-graphs/SKILL.md`
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/checkpointing.md
-```md
-# Checkpointing
-
-The {py:class}`bridge.training.config.CheckpointConfig` controls model checkpointing behavior, including saving and loading checkpoints, checkpoint formats, and various optimization features.
-
-```{Note}
-This documentation covers **Megatron-format checkpoints** used during training. For converting between 🤗 Hugging Face and Megatron formats, see the {doc}`../bridge-guide`.
-```
-
-## Overview
-
-Megatron Bridge uses Megatron Core's distributed checkpointing system, which is designed for large-scale training across multiple GPUs and nodes. The distributed checkpoint approach saves the state of a distributed training job by sharding checkpoint data across multiple files, reducing memory overhead and improving GPU utilization during save/load operations.
-
-### Distributed Checkpointing Benefits
-
-**Memory Efficiency**: Instead of gathering all model parameters and optimizer states on a single rank, distributed checkpointing saves data directly from each rank, significantly reducing memory requirements during checkpointing.
-
-**Parallelism Flexibility**: The system provides flexibility to resume training using different parallelism strategies. You can change tensor parallelism, pipeline parallelism, or data parallelism sizes between checkpoint save and load operations.
-
-**Scalability**: Handles all types of parallelism including:
-- **Data Parallelism (DP)**: Replicates the model across multiple GPUs with different data batches
-- **Tensor Parallelism (TP)**: Distributes individual layer parameters across GPUs  
-- **Pipeline Parallelism (PP)**: Assigns consecutive layers to different GPUs
-- **Context Parallelism (CP)**: Shards tensors along the sequence dimension for long sequences
-- **Expert Parallelism (EP)**: Distributes MoE expert weights across GPUs
-
-**Performance**: The distributed optimizer shards optimizer states and master parameters across data-parallel ranks instead of replicating them, reducing memory usage and communication overhead.
-
-
-## Save Configuration
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `save` | `Optional[str]` | `None` | Output directory to save checkpoints to **in Megatron format** |
-| `save_interval` | `Optional[int]` | `None` | Number of iterations between persistent checkpoint saves |
-| `save_optim` | `bool` | `True` | Whether to save optimizer state |
-| `save_rng` | `bool` | `True` | Whether to save random number generator state |
-| `save_tokenizer_assets` | `bool` | `True` | Whether to save tokenizer files (vocab, config, special tokens) to checkpoint |
-
-### Asynchronous Saving
-
-Asynchronous saving allows training to continue while checkpoint data is persisted to disk in the background, reducing the impact of checkpointing on training throughput.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `async_save` | `bool` | `False` | Enable asynchronous checkpoint saving (requires `torch_dist` format) |
-
-## Load Configuration
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `load` | `Optional[str]` | `None` | Directory containing a model checkpoint to load **in Megatron format** |
-| `load_optim` | `bool` | `True` | Whether to load optimizer state from checkpoint |
-| `load_rng` | `bool` | `True` | Whether to load random number generator state from checkpoint |
-| `load_main_params_from_ckpt` | `bool` | `False` | Load main parameters from checkpoint (use with `load_optim=False`) |
-| `ckpt_step` | `Optional[int]` | `None` | Specific checkpoint iteration to load (overrides latest from tracker) |
-| `exit_on_missing_checkpoint` | `bool` | `False` | Exit if specified checkpoint is not found instead of random initialization |
-| `dist_ckpt_strictness` | `Literal[...]` | `"assume_ok_unexpected"` | Handling of key mismatches during distributed checkpoint load |
-
-### Loading Specific Checkpoint Iterations
-
-By default, Megatron Bridge loads the **latest checkpoint** available in the specified directory by reading from the tracker file (`latest_train_state.pt`). However, you can explicitly load from a specific checkpoint iteration using the `ckpt_step` parameter.
-
-**Python API:**
-```python
-from megatron.bridge.training.config import CheckpointConfig
-
-# Load latest checkpoint
-checkpoint = CheckpointConfig(
-    load="/path/to/checkpoint_dir"
-)
-
-# Load specific iteration
-checkpoint = CheckpointConfig(
-    load="/path/to/checkpoint_dir",
-    ckpt_step=5000  # Overrides tracker, loads iter_0005000
-)
-```
-
-```{note}
-The `load` parameter should always point to the base checkpoint directory (not the `iter_N` subdirectory). The `ckpt_step` parameter overrides which iteration is loaded from that directory.
-
-**Important:** If `ckpt_step` is specified but the checkpoint directory does not exist, training will **fail immediately** with a `FileNotFoundError`. This is intentional to prevent accidentally starting training from scratch when you meant to resume from a specific checkpoint.
-
-**PEFT Note:** The `ckpt_step` parameter applies **only to the `load` path** (adapter checkpoints), not to `pretrained_checkpoint` (frozen base model). When resuming PEFT training:
-- `pretrained_checkpoint`: Always loads the latest/release checkpoint (base model)
-- `load` + `ckpt_step`: Can load a specific adapter checkpoint iteration
-```
-
-
-### Checkpoint Loading Strictness
-
-When loading distributed checkpoints, there may be mismatches between the keys in the saved checkpoint and what the current model expects. This can happen when resuming training with different parallelism settings, model configurations, or software versions. The `dist_ckpt_strictness` parameter controls how these mismatches are handled:
-
-- **`assume_ok_unexpected`**: Assume unexpected keys are acceptable (default, most permissive)
-- **`log_unexpected`**: Log unexpected keys but continue loading
-- **`log_all`**: Log all key mismatches for debugging
-- **`raise_unexpected`**: Raise error on unexpected keys (stricter validation)
-- **`raise_all`**: Raise error on any key mismatch (strictest validation)
-- **`return_unexpected`**: Return information about unexpected keys
-- **`return_all`**: Return information about all key mismatches
-- **`ignore_all`**: Ignore all key mismatches completely
-
-## Fine-tuning Configuration
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `pretrained_checkpoint` | `Optional[str]` | `None` | Directory containing pretrained model checkpoint **in Megatron format** for fine-tuning |
-
-## Checkpoint Format
-
-Megatron Bridge supports multiple checkpoint formats optimized for different use cases:
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `ckpt_format` | `Literal["torch_dist", "zarr", "fsdp_dtensor"]` | `"torch_dist"` | Checkpoint format to use |
-
-### Available Formats
-
-**`torch_dist`** (Default)
-- PyTorch distributed checkpoint format
-- Compatible with most parallelism strategies (DP, TP, PP, CP, EP)
-- Supports asynchronous saving when `async_save=True`
-- Recommended for general use
-
-**`zarr`**
-- Zarr-based checkpoint format
-- Alternative to `torch_dist` for certain use cases
-- Compatible with distributed parallelism strategies
-
-**`fsdp_dtensor`**
-- Specialized format for Megatron FSDP (Fully Sharded Data Parallel)
-- **Required when using `use_megatron_fsdp=True`**
-- Optimized for sharded parameter layouts
-- Not compatible with other FSDP implementations
-
-### Format Selection
-
-Choose your checkpoint format based on your training configuration:
-
-```python
-from megatron.bridge.training.config import CheckpointConfig
-
-# Standard distributed training (DDP, TP, PP)
-checkpoint = CheckpointConfig(
-    ckpt_format="torch_dist",  # Default, works for most cases
-    save="/path/to/checkpoints",
-)
-
-# Megatron FSDP training
-checkpoint = CheckpointConfig(
-    ckpt_format="fsdp_dtensor",  # Required for FSDP
-    save="/path/to/checkpoints",
-)
-```
-
-### Format Compatibility
-
-| Format | DDP | Distributed Optimizer | Megatron FSDP | Torch FSDP2 | Async Save |
-|--------|-----|----------------------|---------------|-------------|------------|
-| `torch_dist` | ✅ | ✅ | ❌ | ✅ | ✅ |
-| `zarr` | ✅ | ✅ | ❌ | ✅ | ❌ |
-| `fsdp_dtensor` | ❌ | ❌ | ✅ | ❌ | ❌ |
-
-**Important**: When using Megatron FSDP (`use_megatron_fsdp=True`), you must set `ckpt_format="fsdp_dtensor"`. Other formats are not compatible with FSDP's sharded parameter layout. See {doc}`megatron-fsdp` for complete FSDP configuration details.
-
-## Performance Optimizations
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `fully_parallel_save` | `bool` | `True` | Apply full save parallelization across data parallel ranks |
-| `fully_parallel_load` | `bool` | `False` | Apply full load parallelization across data parallel ranks |
-| `ckpt_assume_constant_structure` | `bool` | `False` | Assume constant model/optimizer structure over successive checkpoint saves for performance optimizations |
-
-
-## Checkpoint Contents
-
-The checkpoint includes the following components when using the `torch_dist` checkpoint format:
-- **Model parameters and optimizer states**: Stored across `.distcp` files to support distributed training.
-- **Training state**: Captures the current iteration count, number of consumed samples, and the state of the learning rate scheduler.
-- **Configuration**: Serialized as a YAML file (`run_config.yaml`) containing the complete `ConfigContainer`.
-- **Tokenizer files**: All tokenizer artifacts (vocabulary, special tokens, config) for self-contained checkpoints.
-- **Dataloader states**: Ensures deterministic resumption of data iteration.
-- **Metadata**: Used for validating and correctly loading the checkpoint.
-
-Megatron Bridge creates checkpoints with the following directory structure:
-
-```
-checkpoint_dir/
-├── latest_train_state.pt                      # Latest training state (top-level)
-├── iter_N/                                    # Checkpoint at iteration N
-│   ├── __0_0.distcp                          # Distributed checkpoint shards: maps to PyTorch DCP weights format
-│   ├── __0_1.distcp                          # Contains model parameters, optimizer states
-│   ├── __1_0.distcp
-│   ├── __1_1.distcp
-│   ├── ...
-│   ├── .metadata                             # PyTorch DCP checkpoint metadata
-│   ├── common.pt                             # MCore dist ckpt states saved from rank 0
-│   ├── metadata.json                         # MCore dist ckpt metadata
-│   ├── run_config.yaml                       # Serialized ConfigContainer
-│   ├── train_state.pt                        # Number of steps, consumed samples, etc
-│   ├── tokenizer/                            # Tokenizer files (saved by default)
-│   │   ├── tokenizer.json                   # Full tokenizer vocabulary
-│   │   ├── tokenizer_config.json            # Tokenizer configuration
-│   │   ├── special_tokens_map.json          # Special token definitions
-│   │   └── ...                              # Other tokenizer artifacts
-│   ├── dataloader_state/                     # Data iterator states
-│   │   ├── train_dataloader_dprank000.pt    # DP rank 0 dataloader state
-│   │   ├── train_dataloader_dprank001.pt    # DP rank 1 dataloader state
-│   │   ├── train_dataloader_dprank002.pt    # DP rank 2 dataloader state
-│   │   └── ...                              # One file per DP rank
-```
-
-### Tokenizer Assets
-
-By default, Megatron Bridge saves all tokenizer files to the checkpoint directory, making checkpoints self-contained and portable. This is particularly important for:
-- **Inference and evaluation**: Direct access to tokenizer for computing logprobs
-- **Portability**: No dependency on original tokenizer file locations
-- **Reproducibility**: Exact tokenizer state is preserved
-
-The tokenizer files saved depend on the tokenizer type:
-- **HuggingFace tokenizers**: `tokenizer.json`, `tokenizer_config.json`, `special_tokens_map.json`, and vocab files
-- **SentencePiece tokenizers**: `tokenizer.model` file
-- **GPT2 BPE tokenizers**: `vocab.json` and `merges.txt`
-- **BERT tokenizers**: `vocab.txt`
-- **Tiktoken tokenizers**: `tokenizer.json`
-
-To disable tokenizer asset saving for performance-sensitive scenarios:
-
-```python
-from megatron.bridge.training.config import CheckpointConfig
-
-checkpoint = CheckpointConfig(
-    save_tokenizer_assets=False,  # Skip tokenizer file saving
-    ...
-)
-```
-
-Or in YAML:
-
-```yaml
-checkpoint:
-  save_tokenizer_assets: false
-```
-
-## Local Checkpointing
-
-Local checkpointing saves model checkpoints directly to storage on each node (e.g., local SSDs or RAM disks), instead of relying solely on a shared network filesystem. This approach can significantly speed up the saving process and reduce the load on shared storage infrastructure.
-
-Local checkpointing leverages the [NVIDIA Resiliency Extension](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/local/index.html) and provides several key features:
-
-- **Local Saving**: Each node saves its part of the checkpoint locally, reducing network I/O and improving save performance.
-- **Synchronous and Asynchronous Support**: Saving can happen synchronously or asynchronously, mirroring the configuration used for global checkpoints.
-- **Automatic Cleanup**: Handles the removal of outdated or incomplete local checkpoints automatically.
-- **Optional Replication**: For multi-node jobs, checkpoints are replicated to other nodes to allow recovery even if a node fails after saving. Single-node jobs do not use replication.
-- **Automated Loading**: When resuming, the framework automatically finds the latest valid checkpoint, comparing local and global checkpoints, and retrieves any needed parts across nodes.
-
-### Non-Persistent Checkpointing Configuration
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `non_persistent_save_interval` | `Optional[int]` | `None` | Iterations between non-persistent saves |
-| `non_persistent_ckpt_type` | `Optional[Literal["global", "local", "in_memory", "None"]]` | `None` | Type of non-persistent checkpointing |
-| `non_persistent_global_ckpt_dir` | `Optional[str]` | `None` | Directory for global non-persistent checkpoints |
-| `non_persistent_local_ckpt_dir` | `Optional[str]` | `None` | Directory for local non-persistent checkpoints |
-| `non_persistent_local_ckpt_algo` | `Literal["fully_parallel", "atomic"]` | `"fully_parallel"` | Algorithm for local non-persistent checkpointing |
-
-### Replication and Fault Tolerance
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `replication` | `bool` | `False` | Enable replication of local checkpoints across ranks |
-| `replication_jump` | `Optional[int]` | `None` | Spacing between ranks storing replicas |
-| `replication_factor` | `int` | `2` | Number of machines storing a replica of each rank's data |
-
-### Checkpointing Distributed Optimizer
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `dist_ckpt_optim_fully_reshardable` | `bool` | `False` | Make optimizer distributed checkpoint fully reshardable (TP/PP/EP/DP) as opposed to plain DP reshardability |
-| `distrib_optim_fully_reshardable_mem_efficient` | `bool` | `False` | Use as little memory as possible during save and load by using Gloo. Only has an effect together with the `dist_ckpt_optim_fully_reshardable` flag |
-
-## Custom Checkpoint Manager
-
-For advanced use cases, you can provide a custom checkpoint manager implementation to override the default save/load behavior. This enables integration with custom storage backends, alternative checkpoint formats, or organization-specific checkpointing workflows.
-
-### Configuration
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `custom_manager_class` | `str \| None` | `None` | Fully qualified class name for a custom `CheckpointManager` implementation |
-
-### Usage
-
-Specify a custom checkpoint manager class in your configuration:
-
-**YAML:**
-```yaml
-checkpoint:
-  save: /path/to/checkpoints
-  custom_manager_class: "mypackage.checkpoint.MyCheckpointManager"
-```
-
-**Python:**
-```python
-from megatron.bridge.training.config import CheckpointConfig
-
-checkpoint = CheckpointConfig(
-    save="/path/to/checkpoints",
-    custom_manager_class="mypackage.checkpoint.MyCheckpointManager",
-)
-```
-
-### Implementing a Custom Manager
-
-Your custom manager must implement the `CheckpointManager` protocol defined in `megatron.bridge.training.checkpointing`:
-
-```python
-from megatron.bridge.training.checkpointing import (
-    CheckpointManager,
-    CheckpointSaveContext,
-    CheckpointLoadContext,
-    save_checkpoint,
-    load_checkpoint,
-    init_checkpointing_context,
-)
-from megatron.bridge.training.config import CheckpointConfig
-from megatron.bridge.training.state import GlobalState
-
-
-class MyCheckpointManager:
-    """Custom checkpoint manager example."""
-
-    def __init__(self, checkpoint_config: CheckpointConfig) -> None:
-        self.checkpoint_config = checkpoint_config
-        # Initialize internal context for caching strategies
-        self._context = init_checkpointing_context(checkpoint_config)
-
-    def save(self, ctx: CheckpointSaveContext) -> None:
-        """Save a checkpoint with custom logic."""
-        # Option 1: Completely custom implementation
-        # my_custom_save(ctx.state, ctx.model, ...)
-
-        # Option 2: Wrap the default implementation
-        save_checkpoint(
-            state=ctx.state,
-            model=ctx.model,
-            optimizer=ctx.optimizer,
-            opt_param_scheduler=ctx.opt_param_scheduler,
-            num_floating_point_operations_so_far=ctx.num_floating_point_operations_so_far,
-            checkpointing_context=self._context,
-            non_persistent_ckpt=ctx.non_persistent_ckpt,
-            train_data_iterator=ctx.train_data_iterator,
-        )
-        # Add custom post-processing (e.g., upload to cloud)
-        upload_to_s3(ctx.state.cfg.checkpoint.save)
-
-    def load(self, ctx: CheckpointLoadContext) -> tuple[int, int]:
-        """Load a checkpoint with custom logic."""
-        # Returns (iteration, num_floating_point_operations_so_far)
-        return load_checkpoint(
-            state=ctx.state,
-            model=ctx.model,
-            optimizer=ctx.optimizer,
-            opt_param_scheduler=ctx.opt_param_scheduler,
-            strict=ctx.strict,
-            checkpointing_context=self._context,
-            skip_load_to_model_and_opt=ctx.skip_load_to_model_and_opt,
-        )
-
-    def finalize_async_saves(
-        self, state: GlobalState, blocking: bool = False, terminate: bool = False
-    ) -> None:
-        """Finalize any pending asynchronous saves."""
-        from megatron.bridge.training.checkpointing import maybe_finalize_async_save
-
-        maybe_finalize_async_save(
-            global_state=state,
-            ckpt_cfg=self.checkpoint_config,
-            blocking=blocking,
-            terminate=terminate,
-        )
-```
-
-### Context Dataclasses
-
-The save and load methods receive context dataclasses that bundle all required parameters:
-
-**`CheckpointSaveContext`:**
-| Field | Type | Description |
-|-------|------|-------------|
-| `state` | `GlobalState` | Global training state (config, train_state, loggers) |
-| `model` | `list[MegatronModule]` | Model modules to save |
-| `optimizer` | `MegatronOptimizer \| None` | Optimizer instance |
-| `opt_param_scheduler` | `Any \| None` | Learning rate scheduler |
-| `num_floating_point_operations_so_far` | `int` | Cumulative FLOPs |
-| `train_data_iterator` | `Any \| None` | Data iterator (optional) |
-| `non_persistent_ckpt` | `bool` | Whether this is a non-persistent checkpoint |
-
-**`CheckpointLoadContext`:**
-| Field | Type | Description |
-|-------|------|-------------|
-| `state` | `GlobalState` | Global training state |
-| `model` | `list[MegatronModule]` | Model modules to load into |
-| `optimizer` | `MegatronOptimizer \| None` | Optimizer instance |
-| `opt_param_scheduler` | `Any \| None` | Learning rate scheduler |
-| `strict` | `bool` | Enforce strict loading (default: `True`) |
-| `skip_load_to_model_and_opt` | `bool` | Skip loading into model/optimizer (default: `False`) |
-
-### Limitations
-
-The custom checkpoint manager is designed for customizing the save/load **operations** during training. The following limitations apply:
-
-**Checkpoint format compatibility**: Custom managers that change the checkpoint directory structure or metadata files (e.g., `latest_train_state.pt`, `run_config.yaml`) are not well supported. Many utilities in Megatron Bridge assume the standard Megatron checkpoint format. For instance, HuggingFace ↔ custom format conversion is not supported.
-
-**PEFT with custom checkpoints**: When using PEFT (Parameter-Efficient Fine-Tuning), the `pretrained_checkpoint` path must point to a Megatron-format checkpoint. The custom manager only applies to the training save/load flow (the `save` and `load` configuration paths), not to base model loading for PEFT.
-
-**Inference loading**: Loading checkpoints for inference via `model_load_save.py` utilities is undefined behavior with custom checkpoint formats. Use your custom format's loading utilities instead.
-
-### Default Behavior
-
-When `custom_manager_class` is not set, Megatron Bridge uses `DefaultCheckpointManager`, which wraps the existing `save_checkpoint` and `load_checkpoint` functions. This ensures full backward compatibility—the checkpoint manager abstraction introduces no changes to existing training workflows.
-
-## Related Documentation
-
-- {doc}`megatron-fsdp` - Megatron FSDP configuration and `fsdp_dtensor` format requirements
-- {doc}`../parallelisms` - Understanding data and model parallelism strategies
-- {doc}`config-container-overview` - Complete configuration reference
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/gpt_step.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from functools import partial
-from typing import Iterable
-
-import modelopt.torch.distill as mtd
-import torch
-from megatron.core import parallel_state
-from megatron.core.models.gpt import GPTModel
-from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage
-from megatron.core.utils import (
-    get_batch_on_this_cp_rank,
-    get_model_config,
-    is_te_min_version,
-    unwrap_model,
-)
-
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.losses import masked_next_token_loss
-from megatron.bridge.training.post_training.distillation import loss_func_kd
-from megatron.bridge.training.state import GlobalState
-from megatron.bridge.training.utils.packed_seq_utils import get_packed_seq_params
-from megatron.bridge.training.utils.pg_utils import get_pg_collection
-
-
-# Module-level logger named after this module's import path.
-logger = logging.getLogger(__name__)
-
-
-def _partition_packed_batch_for_cp(batch: dict[str, torch.Tensor], cp_size: int) -> dict[str, torch.Tensor]:
-    """Partition THD/packed batches across context-parallel ranks.
-
-    Uses transformer_engine's `thd_get_partitioned_indices` to slice sequence
-    dimension aligned with packed cu_seqlens. This avoids the generic
-    `get_batch_on_this_cp_rank` slicing which assumes contiguous sequence tokens.
-
-    Args:
-        batch: Batch dictionary; must contain a "cu_seqlens" entry. It is
-            modified in place.
-        cp_size: Context-parallel size forwarded to `thd_get_partitioned_indices`.
-
-    Returns:
-        The same `batch` dictionary, with every non-None entry outside the
-        metadata keys replaced by its `index_select` along dimension 1.
-
-    Raises:
-        RuntimeError: If the installed Transformer Engine is older than 1.10.0.
-        ModuleNotFoundError: If `transformer_engine_torch` cannot be imported.
-        ValueError: If "cu_seqlens" has more than one dimension and its first
-            dimension is not of size 1.
-    """
-
-    err_msg = "Please update Transformer Engine to >= 1.10 to use Context Parallel with THD format data"
-    # Fail early (and log) when TE is missing or too old for THD context parallelism.
-    try:
-        import transformer_engine_torch as tex
-
-        if not is_te_min_version("1.10.0"):
-            logger.error(err_msg)
-            raise RuntimeError(err_msg)
-    except ModuleNotFoundError as e:
-        logger.error(err_msg)
-        raise e
-
-    cp_rank = parallel_state.get_context_parallel_rank()
-    cu_seqlens = batch["cu_seqlens"]
-    if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1:
-        raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)")
-    # Only the local copy is squeezed; batch["cu_seqlens"] itself is left untouched.
-    cu_seqlens = cu_seqlens.squeeze()
-    cu_seqlens_unpadded = batch.get("cu_seqlens_unpadded")
-    if cu_seqlens_unpadded is not None:
-        # Unlike cu_seqlens, the squeezed unpadded offsets are written back to the batch.
-        batch["cu_seqlens_unpadded"] = cu_seqlens_unpadded.squeeze()
-
-    # Packed-sequence metadata entries that the loop below must not slice.
-    skip_keys = {
-        "cu_seqlens",
-        "cu_seqlens_unpadded",
-        "cu_seqlens_argmin",
-        "cu_seqlens_unpadded_argmin",
-        "max_seqlen",
-        "token_count",
-    }
-
-    # Only values of existing keys are reassigned, so mutating the dict while
-    # iterating does not change its size.
-    for key, val in batch.items():
-        if val is None or key in skip_keys:
-            continue
-        # Indices are recomputed per entry from that entry's own size along dim 1.
-        index = tex.thd_get_partitioned_indices(cu_seqlens, val.size(1), cp_size, cp_rank)
-        batch[key] = val.index_select(1, index)
-
-    return batch
-
-
-def get_batch_from_iterator(
-    data_iterator: Iterable,
-    use_mtp: bool = False,
-    skip_getting_attention_mask_from_dataset: bool = True,
-    *,
-    is_first_pp_stage: bool,
-    is_last_pp_stage: bool,
-) -> dict[str, torch.Tensor]:
-    """Get a batch of data from the iterator.
-
-    Every key of the fetched batch is kept in the result: required tensors are
-    moved to the GPU or the CPU, and all other keys are set to None.
-
-    Args:
-        data_iterator: The data iterator to get the batch from.
-        use_mtp: Whether Multi-Token Prediction layers are enabled.
-        skip_getting_attention_mask_from_dataset: If set, the dataset will pass a None attention mask.
-        is_first_pp_stage: Whether the caller is the first pipeline stage;
-            if so, "tokens" and "position_ids" are required.
-        is_last_pp_stage: Whether the caller is the last pipeline stage;
-            if so, "labels" and "loss_mask" are required.
-
-    Returns:
-        dict[str, torch.Tensor]: A dictionary containing the batch data.
-    """
-    batch = next(data_iterator)
-
-    # Keys in the first set are moved with .cuda(), keys in the second with .cpu().
-    required_device_keys = set()
-    required_host_keys = set()
-
-    if not skip_getting_attention_mask_from_dataset:
-        required_device_keys.add("attention_mask")
-
-    # Packed-sequence batches carry cu_seqlens plus host-side bookkeeping values.
-    if "cu_seqlens" in batch:
-        required_device_keys.add("cu_seqlens")
-        if "cu_seqlens_unpadded" in batch:
-            required_device_keys.add("cu_seqlens_unpadded")
-        required_host_keys.add("cu_seqlens_argmin")
-        required_host_keys.add("max_seqlen")
-        if "cu_seqlens_unpadded_argmin" in batch:
-            required_host_keys.add("cu_seqlens_unpadded_argmin")
-
-    # With MTP enabled, tokens and position_ids are required regardless of stage.
-    if is_first_pp_stage or use_mtp:
-        required_device_keys.update(("tokens", "position_ids"))
-    if is_last_pp_stage:
-        required_device_keys.update(("labels", "loss_mask"))
-
-    _batch_required_keys = {}
-    for key, val in batch.items():
-        if key in required_device_keys:
-            _batch_required_keys[key] = val.cuda(non_blocking=True) if val is not None else None
-        elif key in required_host_keys:
-            _batch_required_keys[key] = val.cpu() if val is not None else None
-        else:
-            # Non-required keys stay present in the result but carry no data.
-            _batch_required_keys[key] = None
-
-    return _batch_required_keys
-
-
-def get_batch(
-    data_iterator: Iterable, cfg: ConfigContainer, use_mtp: bool = False, *, pg_collection
-) -> tuple[
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor | None,
-    torch.Tensor | None,
-]:
-    """Generate a batch.
-
-    Args:
-        data_iterator: Input data iterator
-        cfg: Configuration container
-        use_mtp: Whether Multi-Token Prediction layers are enabled
-        pg_collection: Process group collection; its `pp` group decides the
-            pipeline stage role and its `cp` group drives context-parallel slicing
-
-    Returns:
-        tuple of tensors containing tokens, labels, loss_mask, attention_mask, position_ids,
-        cu_seqlens, cu_seqlens_argmin, max_seqlen, cu_seqlens_unpadded, and
-        cu_seqlens_unpadded_argmin. On pipeline stages that are neither first nor
-        last, a tuple of ten None values is returned without reading the iterator.
-    """
-    # Determine pipeline stage role via process group collection
-    is_first = is_pp_first_stage(pg_collection.pp)
-    is_last = is_pp_last_stage(pg_collection.pp)
-    if (not is_first) and (not is_last):
-        return None, None, None, None, None, None, None, None, None, None
-
-    batch = get_batch_from_iterator(
-        data_iterator,
-        use_mtp,
-        getattr(cfg.dataset, "skip_getting_attention_mask_from_dataset", True),
-        is_first_pp_stage=is_first,
-        is_last_pp_stage=is_last,
-    )
-
-    cp_size = pg_collection.cp.size()
-    has_packed = batch.get("cu_seqlens") is not None
-    # Packed (THD) batches need the cu_seqlens-aware partitioning when CP is active.
-    if has_packed and cp_size > 1:
-        batch = _partition_packed_batch_for_cp(batch, cp_size)
-    else:
-        # slice batch along sequence dimension for context parallelism
-        batch = get_batch_on_this_cp_rank(batch, cp_group=pg_collection.cp)
-
-    return (
-        batch["tokens"],
-        batch["labels"],
-        batch["loss_mask"],
-        batch.get(
-            "attention_mask"
-        ),  # Attention_mask is optional for pre-training as a causal mask is generated automatically.
-        batch["position_ids"],
-        batch.get("cu_seqlens"),
-        batch.get("cu_seqlens_argmin"),
-        batch.get("max_seqlen"),
-        batch.get("cu_seqlens_unpadded"),
-        batch.get("cu_seqlens_unpadded_argmin"),
-    )
-
-
-def _forward_step_common(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """Forward training step.
-
-    Fetches a batch, assembles the model's forward arguments (including packed
-    sequence parameters when the batch provides cu_seqlens) and runs the model.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and loss mask. When
-        `return_schedule_plan` is True, the first element is the result of
-        `model.build_schedule_plan(...)` instead of the output tensor.
-
-    Raises:
-        AssertionError: If `return_schedule_plan` is True while
-            `config.overlap_moe_expert_parallel_comm` is falsy.
-    """
-    timers = state.timers
-    straggler_timer = state.straggler_timer
-
-    config = get_model_config(model)
-    pg_collection = get_pg_collection(model)
-    # mtp_num_layers may be absent or None; both cases are treated as 0.
-    use_mtp = (getattr(config, "mtp_num_layers", None) or 0) > 0
-
-    timers("batch-generator", log_level=2).start()
-    with straggler_timer(bdata=True):
-        (
-            tokens,
-            labels,
-            loss_mask,
-            attention_mask,
-            position_ids,
-            cu_seqlens,
-            cu_seqlens_argmin,
-            max_seqlen,
-            cu_seqlens_unpadded,
-            cu_seqlens_unpadded_argmin,
-        ) = get_batch(data_iterator, state.cfg, use_mtp, pg_collection=pg_collection)
-    timers("batch-generator").stop()
-
-    forward_args = {
-        "input_ids": tokens,
-        "position_ids": position_ids,
-        "attention_mask": attention_mask,
-        "labels": labels,
-    }
-
-    # Add packed sequence support
-    if cu_seqlens is not None:
-        packed_seq_params = {
-            "cu_seqlens": cu_seqlens,
-            "cu_seqlens_argmin": cu_seqlens_argmin,
-            "max_seqlen": max_seqlen,
-            "cu_seqlens_unpadded": cu_seqlens_unpadded,
-            "cu_seqlens_unpadded_argmin": cu_seqlens_unpadded_argmin,
-        }
-        forward_args["packed_seq_params"] = get_packed_seq_params(packed_seq_params)
-
-    with straggler_timer:
-        if return_schedule_plan:
-            assert config.overlap_moe_expert_parallel_comm, (
-                "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan"
-            )
-            # NOTE(review): packed_seq_params is not forwarded on this path — confirm this is intended.
-            schedule_plan = model.build_schedule_plan(
-                tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask
-            )
-            return schedule_plan, loss_mask
-        else:
-            output_tensor = model(**forward_args)
-
-    return output_tensor, loss_mask
-
-
-def forward_step(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, partial]:
-    """Forward training step.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and the loss function
-    """
-    output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan)
-
-    loss_function = _create_loss_function(
-        loss_mask,
-        check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss,
-        check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss,
-    )
-
-    return output, loss_function
-
-
-def _create_loss_function(loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial:
-    """Create a partial loss function with the specified configuration.
-
-    Args:
-        loss_mask: Used to mask out some portions of the loss
-        check_for_nan_in_loss: Whether to check for NaN values in the loss
-        check_for_spiky_loss: Whether to check for spiky loss values
-
-    Returns:
-        A partial function that can be called with output_tensor to compute the loss
-    """
-    return partial(
-        masked_next_token_loss,
-        loss_mask,
-        check_for_nan_in_loss=check_for_nan_in_loss,
-        check_for_spiky_loss=check_for_spiky_loss,
-    )
-
-
-def forward_step_modelopt(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, partial]:
-    """Forward training step with ModelOpt required modifications.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and the loss function
-    """
-    output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan)
-
-    loss_function = _create_loss_function_modelopt(
-        loss_mask,
-        model,
-        check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss,
-        check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss,
-    )
-
-    return output, loss_function
-
-
-def _create_loss_function_modelopt(
-    loss_mask: torch.Tensor, model: GPTModel, check_for_nan_in_loss: bool, check_for_spiky_loss: bool
-) -> partial:
-    """Create a partial loss function with the specified configuration.
-
-    Kept here for backward compatibility with tests and callers that patch
-    `megatron.bridge.training.gpt_step.masked_next_token_loss`.
-
-    Args:
-        loss_mask: Used to mask out some portions of the loss
-        model: The GPT Model
-        check_for_nan_in_loss: Whether to check for NaN values in the loss
-        check_for_spiky_loss: Whether to check for spiky loss values
-
-    Returns:
-        A partial function that can be called with output_tensor to compute the loss
-    """
-    mnt_loss_func = partial(
-        masked_next_token_loss,
-        loss_mask,
-        check_for_nan_in_loss=check_for_nan_in_loss,
-        check_for_spiky_loss=check_for_spiky_loss,
-    )
-    unwrapped_model = unwrap_model(model)
-    if isinstance(unwrapped_model, mtd.DistillationModel):
-        return partial(loss_func_kd, loss_mask=loss_mask, original_loss_fn=mnt_loss_func, model=unwrapped_model)
-    else:
-        return mnt_loss_func
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/pretrain.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch.distributed as dist
-from nvidia_resiliency_ext.inprocess import CallWrapper
-
-from megatron.bridge.data.utils import get_dataset_provider
-from megatron.bridge.training.callbacks import Callback, CallbackManager, normalize_callbacks
-from megatron.bridge.training.config import ConfigContainer, runtime_config_update
-from megatron.bridge.training.eval import evaluate_and_print_results
-from megatron.bridge.training.forward_step_func_types import ForwardStepCallable
-from megatron.bridge.training.setup import setup
-from megatron.bridge.training.state import GlobalState
-from megatron.bridge.training.train import _finish_train, train
-from megatron.bridge.training.utils.log_utils import barrier_and_log
-from megatron.bridge.utils.common_utils import print_rank_0
-from megatron.bridge.utils.decorators import experimental_fn
-
-
-@experimental_fn
-def pretrain(
-    config: ConfigContainer,
-    forward_step_func: ForwardStepCallable,
-    callbacks: list[Callback] | CallbackManager | None = None,
-) -> None:
-    """Main function to run the training pipeline.
-
-    Sets up the environment, model, optimizer, scheduler, and data iterators.
-    Performs training, validation, and optionally testing based on the provided
-    configuration.
-
-    Args:
-        config: The main configuration container holding all necessary parameters.
-        forward_step_func: A callable (function or functor) that performs a single
-                          forward and backward step, returning the loss and any computed
-                          metrics. Supports the following signatures:
-                          - 2 args: (data_iterator, model)
-                          - 3 args: (data_iterator, model, return_schedule_plan=False)
-                                   OR (state: GlobalState, data_iterator, model)
-                          - 4 args: (state: GlobalState, data_iterator, model, return_schedule_plan=False)
-        callbacks: Optional callbacks for custom logic injection. Can be:
-                   - list[Callback]: List of Callback subclass instances
-                   - CallbackManager: Pre-configured manager with registered callbacks
-                   - None: No callbacks (default)
-
-    Note:
-        Use the signature with GlobalState type hint for full access to configuration, timers, and training state.
-        State injection is automatic based on type hints or parameter names.
-        Functors (classes with __call__) are fully supported.
-
-    Warnings:
-        This is an experimental API and is subject to change in backwards
-        incompatible ways without notice.
-    """
-    # Apply runtime config updates prior to creating/attaching GlobalState
-    runtime_config_update(config)
-
-    # Create a single GlobalState instance regardless of restart path
-    state = GlobalState()
-    state.cfg = config
-
-    # Normalize callbacks to CallbackManager
-    callback_manager = normalize_callbacks(callbacks)
-
-    if config.inprocess_restart and config.inprocess_restart.enabled:
-        if dist.is_initialized():
-            raise RuntimeError(
-                "In-process restart is incompatible with user-initialized process groups. "
-                "The in-process restart mechanism expects to manage the process group lifecycle "
-                "and will destroy it during fault recovery. Either:\n"
-                "1. Disable in-process restart and manage the process group yourself, or\n"
-                "2. Let the framework initialize the process group by not calling "
-                "torch.distributed.init_process_group() before training."
-            )
-
-        # Apply in-process restart wrapper directly to _pretrain
-        from megatron.bridge.training.inprocess_restart import maybe_wrap_for_inprocess_restart
-
-        # Wrap _pretrain directly and get the store; state is captured for abort
-        wrapped_pretrain, store = maybe_wrap_for_inprocess_restart(_pretrain, config.inprocess_restart, state)
-
-        # Execute the wrapped function - nvidia-resiliency-ext will inject inprocess_call_wrapper
-        # Call with positional args matching the adapter signature: (state, forward_step_func, store=None, inprocess_call_wrapper=None)
-        wrapped_pretrain(state, forward_step_func, callback_manager, store=store)
-    else:
-        # Normal execution without in-process restart
-        _pretrain(state=state, forward_step_func=forward_step_func, callback_manager=callback_manager)
-
-
-def _pretrain(
-    state: GlobalState,
-    forward_step_func: ForwardStepCallable,
-    callback_manager: CallbackManager | None = None,
-    store: dist.Store | None = None,
-    inprocess_call_wrapper: CallWrapper | None = None,
-) -> None:
-    """Internal function containing the actual pretrain logic.
-
-    Args:
-        state: Global training state containing the validated configuration and runtime objects
-        forward_step_func: Function or functor that performs a single forward/backward step
-        callback_manager: Optional CallbackManager for custom callback execution
-        store: Optional distributed Store used by in-process restart for coordination
-        inprocess_call_wrapper: Optional wrapper injected by nvrx to expose restart iteration
-    """
-    # Determine whether the training loop will initialize the process group
-    # If the trainer creates the process group, the trainer should destroy it before returning control back to the user
-    should_destroy_process_group = not dist.is_initialized()
-
-    # Handle in-process restart store prefix
-    if inprocess_call_wrapper is not None:
-        restart_attempt = inprocess_call_wrapper.iteration
-        store = dist.PrefixStore(str(restart_attempt), store)
-
-    config = state.cfg
-    dataset_provider = get_dataset_provider(config.dataset)
-    setup_output = setup(state, dataset_provider, restart_store=store, callback_manager=callback_manager)
-    state = setup_output.state
-    model = setup_output.model
-    optimizer = setup_output.optimizer
-    scheduler = setup_output.scheduler
-    train_data_iterator = setup_output.train_data_iterator
-    valid_data_iterator = setup_output.valid_data_iterator
-    test_data_iterator = setup_output.test_data_iterator
-    checkpoint_manager = setup_output.checkpoint_manager
-    pg_collection = setup_output.pg_collection
-
-    # TRAINING
-    if not config.validation.skip_train:
-        if state.train_state.do_train and config.train.train_iters > 0:
-            train(
-                forward_step_func,
-                model,
-                optimizer,
-                scheduler,
-                train_data_iterator,
-                valid_data_iterator,
-                state,
-                checkpoint_manager,
-                pg_collection,
-                callback_manager=callback_manager,
-            )
-
-        barrier_and_log("after training is done")
-
-    else:
-        print_rank_0("skipping training ...")
-
-    iteration = state.train_state.step
-
-    # VALIDATION
-    if state.train_state.do_valid:
-        prefix = f"iteration {iteration} on validation set"
-        evaluate_and_print_results(
-            state,
-            prefix,
-            forward_step_func,
-            valid_data_iterator,
-            model,
-            config.model,
-            verbose=True,
-            write_to_tensorboard=not config.validation.skip_train,
-            callback_manager=callback_manager,
-        )
-    if state.train_state.do_test:
-        prefix = f"iteration {iteration} on test set"
-        evaluate_and_print_results(
-            state,
-            prefix,
-            forward_step_func,
-            test_data_iterator,
-            model,
-            config.model,
-            verbose=True,
-            write_to_tensorboard=not config.validation.skip_train,
-            callback_manager=callback_manager,
-            is_test=True,
-        )
-
-    _finish_train(state, checkpoint_manager)
-    _maybe_destroy_process_group(should_destroy_process_group)
-
-
-def _maybe_destroy_process_group(should_destroy: bool) -> None:
-    """Destroy the process group if it was created by this training session.
-
-    Args:
-        should_destroy: Whether the process group should be destroyed
-    """
-    if should_destroy and dist.is_initialized():
-        dist.barrier()
-        dist.destroy_process_group()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/pp_comm_overlap.png
-```png
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/pruning.md
-```md
-# Pruning
-
-Pruning reduces model size by removing redundant parameters (e.g., shrinking hidden dimensions or layers) while preserving accuracy. In Megatron Bridge, pruning is provided by [NVIDIA Model Optimizer (ModelOpt)](https://github.com/NVIDIA/Model-Optimizer) using the Minitron algorithm for GPT and Mamba-based models loaded from HuggingFace.
-
-## Pre-requisites
-
-Running the pruning example requires Megatron-Bridge and Model-Optimizer dependencies. We recommend using the NeMo container (e.g., `nvcr.io/nvidia/nemo:26.02`). To use the latest ModelOpt scripts, mount your Model-Optimizer repo to the container.
-
-```bash
-export MODELOPT_DIR=${PWD}/Model-Optimizer # or set to your local Model-Optimizer repository path if you have cloned it
-if [ ! -d "${MODELOPT_DIR}" ]; then
-  git clone https://github.com/NVIDIA/Model-Optimizer.git ${MODELOPT_DIR}
-fi
-
-export DOCKER_IMAGE=nvcr.io/nvidia/nemo:26.02
-docker run \
-  --gpus all \
-  --shm-size=20g \
-  --net=host \
-  --ulimit memlock=-1 \
-  --rm -it \
-  -v ${MODELOPT_DIR}:/opt/Model-Optimizer \
-  -v ${MODELOPT_DIR}/modelopt:/opt/venv/lib/python3.12/site-packages/modelopt \
-  -w /opt/Model-Optimizer/examples/megatron_bridge \
-  ${DOCKER_IMAGE} bash
-```
-
-Once inside the container, you need to login with your HuggingFace token to download gated datasets / models.
-Note that the default dataset for pruning is [`nemotron-post-training-dataset-v2`](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2), which is gated.
-
-```bash
-huggingface-cli login --token <your token>
-```
-
-## Usage
-
-### Prune to a target parameter count (using Neural Architecture Search)
-
-Example: prune Qwen3-8B to 6B on 2 GPUs (Pipeline Parallelism = 2), skipping pruning of `num_attention_heads`. Defaults: 1024 samples from [nemotron-post-training-dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) for calibration, at most 20% depth (`num_layers`) and 40% width per prunable hyperparameter (`hidden_size`, `ffn_hidden_size`, ...), top-10 candidates evaluated for MMLU (5% sampled data) to select the best model.
-
-```bash
-torchrun --nproc_per_node 2 prune_minitron.py \
-    --pp_size 2 \
-    --hf_model_name_or_path Qwen/Qwen3-8B \
-    --prune_target_params 6e9 \
-    --hparams_to_skip num_attention_heads \
-    --output_hf_path /tmp/Qwen3-8B-Pruned-6B
-```
-
-### Prune to a specific architecture (using manual configuration)
-
-Example: prune Qwen3-8B to a fixed architecture. Defaults: 1024 samples from [nemotron-post-training-dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) for calibration.
-
-```bash
-torchrun --nproc_per_node 2 prune_minitron.py \
-    --pp_size 2 \
-    --hf_model_name_or_path Qwen/Qwen3-8B \
-    --prune_export_config '{"hidden_size": 3584, "ffn_hidden_size": 9216}' \
-    --output_hf_path /tmp/Qwen3-8B-Pruned-6B-manual
-```
-
-To see the full list of options for advanced configurations, run:
-
-```bash
-torchrun --nproc_per_node 1 prune_minitron.py --help
-```
-
-### Uneven pipeline parallelism
-
-If the number of layers is not divisible by the number of GPUs (pipeline parallel size), set `--num_layers_in_first_pipeline_stage` and `--num_layers_in_last_pipeline_stage`. For example, Qwen3-8B with 36 layers on 8 GPUs: set both to 3 to get 3-5-5-5-5-5-5-3 layers per GPU.
-
-## More information
-
-For more details, see the [ModelOpt pruning README](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge#readme).
-
-## Next steps: Knowledge Distillation
-
-Knowledge Distillation is required to recover the performance of the pruned model. See the [Knowledge Distillation](distillation.md) guide for more details.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/resiliency.md
-```md
-# Resiliency
-
-Megatron Bridge incorporates resilient training features from the
-[NVIDIA Resiliency Extension](https://github.com/NVIDIA/nvidia-resiliency-ext).
-This extension provides fault-tolerant capabilities that help minimize downtime
-due to failures and interruptions during training.
-
-This page is the stable overview for what each resiliency feature is, when to
-use it, and which constraints are durable. For operational setup, config knobs,
-parameter tables, code anchors, and verification commands, see [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md).
-
-## What It Is
-
-| Feature | Purpose | Maturity | Cluster |
-|---|---|---|---|
-| Fault tolerance | Hang detection + automatic job restart | Production | Slurm only |
-| NVRx straggler detection | Identify slow GPUs | Production | Any |
-| Preemption | Graceful shutdown before time limit | Production | Slurm only |
-| Async checkpoint save | Non-blocking checkpoint writes | Production | Any |
-| Local checkpointing | Fast local save with replication | Production | Any |
-| Re-run state machine | NaN / spiky loss attribution | Experimental | Any |
-| In-process restart | Restart within the same process | Experimental | Any |
-
-## Fault Tolerance
-
-The fault tolerance feature detects hangs during training and automatically
-restarts the workload. It uses section-based monitoring with different timeout
-thresholds for setup, training steps, and checkpointing operations.
-
-### When to Use It
-
-Fault tolerance is a good fit when:
-
-- training on unreliable hardware or at very large scale
-- transient faults (network glitches, GPU errors) are common
-- you want automatic recovery without manual intervention
-
-### Stable Constraints
-
-- Requires Slurm and `ft_launcher` (not `torchrun`)
-- Checkpoint directory must be configured and accessible
-- Uses `nvidia-resiliency-ext` RankMonitorClient
-- Not compatible with NSys profiling
-
-The system supports both in-job restarts (within the same Slurm allocation) and
-new job launches on failure, with configurable limits for each.
-
-## Straggler Detection
-
-NVRx straggler detection monitors GPU performance across ranks and identifies
-slow-performing nodes. It calculates both relative and individual performance
-scores, and can optionally terminate training if performance falls below
-configurable thresholds.
-
-### When to Use It
-
-Straggler detection is useful when:
-
-- training at scale where one slow node degrades overall throughput
-- you want visibility into per-rank GPU performance
-- you need to identify persistent hardware issues
-
-### Stable Constraints
-
-- Requires `nvidia-resiliency-ext`
-- Overhead is minimal but can be tuned via `profiling_interval`
-- Does **not** stop training by default; `stop_if_detected` must be
-  explicitly set to `True` for automatic termination
-
-## Preemption
-
-Preemption handling provides graceful shutdown when a training job receives a
-termination signal (default: SIGTERM). It saves a checkpoint before exiting to
-preserve training progress.
-
-### When to Use It
-
-Preemption is important when:
-
-- running on shared clusters with job time limits
-- higher-priority jobs may preempt your allocation
-- you want to minimize lost work on job termination
-
-### Stable Constraints
-
-- The `PreemptionPlugin` is Slurm-specific
-- Direct configuration via `exit_signal_handler` works on any cluster
-- Signal detection happens at the end of each training step
-
-## Async Checkpoint Save
-
-Async checkpoint save overlaps checkpoint I/O with training compute using
-persistent background workers. Training continues immediately after scheduling
-the save rather than blocking until the write completes.
-
-### When to Use It
-
-Async save is valuable when:
-
-- checkpoint save time is a significant fraction of step time
-- you are using `torch_dist` checkpoint format
-
-### Stable Constraints
-
-- Requires `ckpt_format="torch_dist"`
-- Other formats (zarr, fsdp_dtensor) do not support async save
-- The persistent checkpoint worker must be enabled
-
-## Local Checkpointing
-
-Local checkpointing saves checkpoint data to node-local storage first, then
-replicates across a configurable number of nodes. This avoids the latency of
-writing to shared network storage during the critical path.
-
-### When to Use It
-
-Local checkpointing is useful when:
-
-- shared-storage checkpoint writes are the bottleneck in your checkpoint interval
-- you want faster recovery from node failures without depending on network filesystem availability
-- training at scale where network-storage contention is common
-
-### Stable Constraints
-
-- Node-local storage must have sufficient capacity for at least one checkpoint
-- Replication degree must be configured to survive the expected failure rate
-- Requires compatible checkpoint format (see [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md))
-
-## Re-run State Machine
-
-The re-run state machine is an experimental feature for attributing unexpected
-results (NaN loss, spiky loss) to transient errors, persistent hardware faults,
-or correct-but-unexpected results. It works by re-running computations on the
-same and different GPUs.
-
-### When to Use It
-
-Consider the re-run state machine when:
-
-- you need automated NaN detection and attribution
-- you want to distinguish hardware faults from training instability
-
-### Stable Constraints
-
-- Alpha-level feature; full integration is limited
-- Three modes: `disabled`, `validate_results`, `report_determinism_stats`
-- Uses specific exit codes (16, 17) to control job behavior
-
-## In-Process Restart
-
-In-process restart provides automatic fault recovery by restarting the training
-function within the same OS process. This avoids the overhead of launching new
-jobs, starting containers, and creating new CUDA contexts.
-
-### When to Use It
-
-In-process restart is appropriate when:
-
-- software faults (exceptions, deadlocks) are more common than hardware faults
-- restart latency matters and you want to avoid full job relaunch
-- you can accept the experimental status and compatibility constraints
-
-### Stable Constraints
-
-- Requires PyTorch >= 2.5.1 and NCCL >= 2.26.2
-- Not compatible with NeMo-Run or Slurm preemption plugins
-- Requires specific environment variables (`NCCL_NVLS_ENABLE=0`, etc.)
-- The PyTorch NCCL watchdog timeout must exceed `hard_timeout`
-- Supports both node-level and rank-level restart granularity
-
-In-process restart is not suitable for hardware-level failures such as switch
-failures or network partitions. For comprehensive fault tolerance, combine it
-with job-level fault tolerance.
-
-## Practical Caveats
-
-1. No single resiliency feature covers all failure modes. The recommended
-   approach is to layer features (e.g., fault tolerance + straggler detection +
-   async checkpoint).
-2. Not all recipes enable resiliency features by default. Check and enable
-   explicitly.
-3. Two straggler detectors exist in the codebase (NVRx and legacy MCore).
-   Use the NVRx version; do not enable both.
-
-## Related Docs
-
-- [docs/training/checkpointing.md](checkpointing.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [skills/resiliency/SKILL.md](../skills/resiliency/SKILL.md)
-- [skills/resiliency/card.yaml](../skills/resiliency/card.yaml)
-- [NVIDIA Resiliency Extension](https://github.com/NVIDIA/nvidia-resiliency-ext)
-- [In-Process Restart Guide](https://nvidia.github.io/nvidia-resiliency-ext/inprocess/index.html)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/activation-recomputation.md
-```md
-# Activation Recomputation
-
-The input activations of network layers are stored in device memory and are used to compute gradients during back-propagation. When training a LLM with a long sequence length or a large micro-batch size, these input activations can quickly saturate device memory. Checkpointing a few activations and recomputing the rest is a common technique to reduce device memory usage.
-
-Activation recomputation in Megatron Bridge is configured through the model provider's recomputation parameters, which are based on Megatron Core's `TransformerConfig`.
-
-## Transformer Layer Recomputation
-
-Megatron Bridge supports transformer layer recomputation, which checkpoints the input of each transformer layer and recomputes the activations for the remaining layers. This technique significantly reduces activation memory usage. However, it increases the per-transformer layer computation cost by 30% due to re-executing the entire layer's forward computation.
-
-Megatron Bridge also supports partial transformer layer recomputation, which is beneficial when recomputing a few transformer layers helps to reduce enough GPU memory for the model to fit. This approach avoids the need to recompute the rest of the layers.
-
-### Configuration
-
-Transformer layer recomputation is configured through the model provider's recomputation parameters:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Full recomputation - recompute all layers
-model_config = GPTModelProvider(
-    recompute_granularity="full",  # Enable full layer recomputation
-    recompute_method="uniform",    # Uniform distribution across layers
-    recompute_num_layers=4,        # Number of layers per recomputation block
-    # ... other model parameters
-)
-```
-
-### Recomputation Methods
-
-#### Block Method
-Recomputes a specific number of transformer layers per pipeline stage:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="full",
-    recompute_method="block",      # Block-wise recomputation
-    recompute_num_layers=4,        # Recompute 4 layers per pipeline stage
-)
-```
-
-#### Uniform Method
-Uniformly divides the total number of transformer layers and recomputes input activations for each divided chunk:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="full",
-    recompute_method="uniform",    # Uniform distribution
-    recompute_num_layers=8,        # Number of layers per recomputation block
-)
-```
-
-### Pipeline Parallelism Considerations
-
-When training with pipeline parallelism:
-- `recompute_num_layers` indicates the layers per pipeline stage
-- When using virtual pipelining, `recompute_num_layers` specifies the number of layers per virtual pipeline stage
-- The framework automatically handles recomputation coordination across pipeline stages
-
-![Activation Recomputation Methods](images/activation-recomputation-example-1.jpg)
-*Figure 1: Scheme of uniform and block checkpointing method (full checkpointing granularity)*
-
-## Self-attention Recomputation
-
-Megatron Bridge supports selective self-attention recomputation that checkpoints the inputs of each self-attention block and recomputes the intermediate input activations. This cost-efficient method achieves high memory savings with minimal recomputation cost.
-
-The intermediate layers of the self-attention block account for the majority of the activation memory because the input sizes of softmax, dropout, and QKV dot-product attention layers have memory complexity proportional to the sequence length squared. However, their recomputation cost is relatively smaller than other linear projection layers that scale with the hidden size squared.
-
-![Activation Recomputation Granularity](images/activation-recomputation-example-2.jpg)
-*Figure 2: Scheme of full and selective checkpointing granularity*
-
-### Configuration
-
-Self-attention recomputation is enabled using selective granularity:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-model_config = GPTModelProvider(
-    recompute_granularity="selective",  # Enable selective recomputation
-    recompute_modules=["core_attn"],    # Recompute attention modules (default)
-    # ... other model parameters
-)
-```
-
-### Recomputation Modules
-
-Megatron Bridge supports selective recomputation for various modules:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="selective",
-    recompute_modules=[
-        "core_attn",      # Core attention computation (default)
-        "mlp",            # MLP layers
-        "layernorm",      # Layer normalization
-        "moe",            # Mixture of Experts layers
-        "moe_act",        # MoE activation functions
-        "shared_experts", # Shared expert layers
-        "mla_up_proj",    # Multi-Latent Attention up projection
-    ],
-)
-```
-
-### Flash Attention Integration
-
-Self-attention recomputation is automatically enabled when using Flash Attention through Transformer Engine. Flash Attention inherently provides memory efficiency by recomputing attention scores rather than storing them, making additional explicit recomputation often unnecessary.
-
-## Advanced Recomputation Configuration
-
-### Distributed Activation Checkpointing
-
-For models using model parallelism, you can distribute saved activations across the model parallel group:
-
-```python
-model_config = GPTModelProvider(
-    recompute_granularity="selective",
-    distribute_saved_activations=True,  # Distribute across model parallel group
-    # Note: Cannot be used with sequence_parallel=True
-)
-```
-
-### Memory vs Computation Trade-offs
-
-Different recomputation strategies offer different memory-computation trade-offs:
-
-- **Selective recomputation**: Provides high memory savings with minimal recomputation cost by targeting memory-intensive operations like attention
-- **Full recomputation**: Significantly reduces activation memory usage but increases per-transformer layer computation cost by approximately 30%
-- **No recomputation**: Preserves all activations in memory, requiring more GPU memory but no additional computation
-
-### MoE-Specific Recomputation
-
-For Mixture of Experts models, specialized recomputation options are available:
-
-```python
-model_config = GPTModelProvider(
-    # MoE configuration
-    num_moe_experts=8,
-    expert_model_parallel_size=2,
-    
-    # MoE recomputation
-    recompute_granularity="selective",
-    recompute_modules=["moe", "moe_act"],  # Recompute MoE-specific modules
-)
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/config-container-overview.md
-```md
-# Configuration Overview
-
-The `ConfigContainer` is the central configuration object in Megatron Bridge that holds all settings for training. It acts as a single source of truth that brings together model architecture, training parameters, data loading, optimization, checkpointing, logging, and distributed training settings.
-
-## What is ConfigContainer
-
-`ConfigContainer` is a dataclass that holds all the configuration objects needed for training:
-
-```python
-from megatron.bridge.training.config import ConfigContainer
-
-# ConfigContainer brings together all training configurations
-config = ConfigContainer(
-    model=model_provider,             # Model architecture and parallelism
-    train=training_config,            # Training loop parameters  
-    optimizer=optimizer_config,       # Megatron Optimization settings
-    scheduler=scheduler_config,       # Learning rate scheduling
-    dataset=dataset_config,           # Data loading configuration
-    logger=logger_config,             # Logging and monitoring
-    tokenizer=tokenizer_config,       # Tokenization settings
-    checkpoint=checkpoint_config,     # Checkpointing and resuming
-    dist=distributed_config,          # Distributed training setup
-    ddp=ddp_config,                   # Megatron Distributed Data Parallel settings
-    # Optional configurations
-    peft=peft_config,                 # Parameter-efficient fine-tuning
-    profiling=profiling_config,       # Performance profiling
-    mixed_precision=mp_config,        # Mixed precision training
-    comm_overlap=comm_overlap_config, # Communication overlap settings
-    # ... and more
-)
-```
-
-## Configuration Components
-
-| Component | Purpose | Required | Default |
-|-----------|---------|----------|---------|
-| `model` | Model architecture and parallelism strategy (GPT, T5, Mamba) | ✅ | - |
-| `train` | Training loop parameters (batch sizes, iterations, validation) | ✅ | - |
-| `optimizer` | Optimizer type and hyperparameters (from Megatron Core) | ✅ | - |
-| `scheduler` | Learning rate and weight decay scheduling | ✅ | - |
-| `dataset` | Data loading and preprocessing configuration | ✅ | - |
-| `logger` | Logging, TensorBoard, and WandB configuration | ✅ | - |
-| `tokenizer` | Tokenizer settings and vocabulary | ✅ | - |
-| `checkpoint` | Checkpointing, saving, and loading | ✅ | - |
-| `dist` | Distributed training initialization | | `DistributedInitConfig()` |
-| `ddp` | Data parallel configuration (from Megatron Core) | | `DistributedDataParallelConfig()` |
-| `rng` | Random number generation settings | | `RNGConfig()` |
-| `rerun_state_machine` | Result validation and error injection | | `RerunStateMachineConfig()` |
-| `mixed_precision` | Mixed precision training settings | | `None` |
-| `comm_overlap` | Communication overlap optimizations | | `None` |
-| `peft` | Parameter-efficient fine-tuning (LoRA, DoRA, etc.) | | `None` |
-| `profiling` | Performance profiling with nsys or PyTorch profiler | | `None` |
-| `ft` | Fault tolerance and automatic recovery | | `None` |
-| `straggler` | GPU straggler detection | | `None` |
-| `nvrx_straggler` | NVIDIA Resiliency Extension straggler detection | | `None` |
-| `inprocess_restart` | In-process restart for fault tolerance | | `None` |
-
-## Design Philosophy
-
-### **Interoperability with External Config Systems**
-
-Megatron Bridge's Python configurations are designed to be amenable to other configuration systems you already use, such as:
-
-- Programmatic configuration: Direct Python object manipulation
-- argparse: Command-line arguments can be easily mapped to dataclass fields
-- File-based overrides: JSON, YAML, or other config files can override Python configs
-
-All of these approaches can be translated into Python dataclass instances. The framework provides utilities as a convenience for YAML-based overrides with OmegaConf, but the framework is not tied to any particular configuration system.
-
-```python
-# All of these approaches work seamlessly:
-
-# 1. Direct Python configuration
-config = ConfigContainer(
-    model=GPTModelProvider(num_layers=24, hidden_size=2048),
-    train=TrainingConfig(global_batch_size=256, train_iters=10000),
-    # ... other configs
-)
-
-# 2. YAML-based serialization and deserialization (round-trip)
-config.to_yaml("my_config.yaml")
-config = ConfigContainer.from_yaml("my_config.yaml")  # Load previously saved config
-
-# 3. Programmatic override after creation
-config.train.global_batch_size = 512  # Override after instantiation
-config.model.num_layers = 48          # Modify model architecture
-```
-
-### Centralized Configuration
-
-Megatron provides extensive flexibility through a rich set of configuration options. The `ConfigContainer` brings all these settings together in a single, organized object. This centralization makes configuration discoverable and maintainable - you have one place to understand and control all aspects of your training run.
-
-Unlike pure YAML-based configuration systems, `ConfigContainer` provides centralization with the full power of Python. You get the organizational benefits of a single configuration file combined with the programmatic flexibility of Python.
-
-The configuration system is built using nested dataclasses, providing:
-
-- **Modularity**: Each config component is independently defined and testable
-- **Type safety**: Full static type checking
-- **IDE support**: Autocomplete and type hints in development environments  
-- **Serialization**: Easy conversion to/from YAML, JSON, or other formats
-- **Validation**: Built-in field validation
-
-```python
-@dataclass
-class ConfigContainer:
-    model: GPTModelProvider      # Dataclass for model architecture
-    train: TrainingConfig        # Dataclass for training parameters
-    optimizer: OptimizerConfig   # Dataclass for optimization settings
-    # ... nested dataclasses for each concern
-```
-
-### Lazy Configuration and Deferred Validation
-
-For training workloads, configurations are lazy to support flexible user workflows:
-
-**Problem with Eager Validation:**
-```python
-# This would be problematic with eager validation:
-config = TrainingConfig(train_iters=1000)
-# __post_init__ calculates dependent values immediately
-
-config.train_iters = 5000  # User override
-# Dependent values are now stale and incorrect!
-```
-
-**Solution with Lazy Finalization:**
-```python
-# Megatron Bridge approach - deferred validation
-config = TrainingConfig(train_iters=1000)
-config.train_iters = 5000  # User can safely override
-
-# Validation happens automatically right when training starts
-pretrain(config, forward_step_func)  # All dependent values calculated correctly
-```
-
-**Benefits:**
-- Users can instantiate configs and subsequently override fields safely
-- Dependent values are calculated correctly after all user modifications are applied
-- Validation happens at the right time, right before training begins
-- Flexible configuration workflows are supported
-
-### **Model Independence**
-
-Model configurations are designed to be independently usable outside the full training loop provided by the framework:
-
-```python
-# Models can be used standalone
-model_provider = GPTModelProvider(
-    num_layers=24,
-    hidden_size=2048,
-    vocab_size=50000,    # Must be explicitly set
-    seq_length=2048,     # Must be explicitly set
-)
-
-# This works independently of other configs
-model_provider.finalize()
-model = model_provider.provide()
-```
-
-**Trade-off**: The price for this flexibility is the need to explicitly set values like `seq_length` in multiple places during training. These settings are checked for consistency at the beginning of training.
-
-## Usage
-
-```python
-# Create and configure
-config = ConfigContainer(
-    model=GPTModelProvider(num_layers=24, seq_length=2048),
-    train=TrainingConfig(train_iters=1000),
-    dataset=GPTDatasetConfig(seq_length=2048),  # Must match model seq_length
-    # ... other required configs
-)
-
-# Modify as needed
-config.train.train_iters = 5000
-config.model.hidden_size = 4096
-
-# Start training - validation happens automatically
-pretrain(config, forward_step_func)
-```
-
-## Configuration Export and Import
-
-### Export to YAML
-```python
-# Print YAML configuration to console
-config.print_yaml()
-
-# Save to file
-config.to_yaml("config.yaml")
-```
-
-### Load from YAML
-```python
-# Load configuration from YAML file
-config = ConfigContainer.from_yaml("config.yaml")
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/multi-token-prediction.md
-```md
-# Multi-Token Prediction (MTP)
-
-## Overview
-
-Multi-Token Prediction (MTP) is an advanced training technique introduced in the [DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437) that enables models to predict multiple future tokens simultaneously during pre-training. Instead of learning to predict only the next token at each position, MTP adds auxiliary prediction heads that predict tokens 2, 3, or more positions ahead.
-
-### Key Benefits
-
-- **Densified Training Signals**: Multiple learning signals per training iteration improve data efficiency
-- **Pre-Planning Representations**: Models learn internal representations that encode information about future tokens
-- **Speculative Decoding Foundation**: MTP-trained models can serve as a foundation for faster inference via speculative decoding
-
-### When to Use MTP
-
-MTP is most beneficial for:
-
-- **Large-scale pre-training** (models > 10B parameters)
-- **Data-constrained scenarios** where maximizing learning from limited data is critical
-- **Training foundation models** intended for downstream fine-tuning or speculative decoding
-
-MTP is primarily used for pre-training.
-
-### Additional Resources
-
-- [DeepSeek-V3 Technical Report](https://arxiv.org/abs/2412.19437) - Original paper introducing MTP
-- [DeepSeek-V3 GitHub](https://github.com/deepseek-ai/DeepSeek-V3) - Official implementation
-- [Megatron Core MTP Guide](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/multi_token_prediction.md) - Low-level implementation details
-
-## Configuration Parameters
-
-MTP is controlled by two primary parameters:
-
-| Parameter | Type | Default | Description | Typical Range |
-|-----------|------|---------|-------------|---------------|
-| `mtp_num_layers` | int | `None` (disabled) | Number of auxiliary prediction depths. Each layer predicts tokens N positions ahead (N=1,2,...,mtp_num_layers). | 1-2 |
-| `mtp_loss_scaling_factor` | float | `0.1` | Weight applied to MTP losses relative to main next-token loss. Controls the contribution of auxiliary predictions to the total loss. | 0.05-0.2 |
-
-### Loss Calculation
-
-The total training loss combines the main next-token prediction loss with averaged MTP losses:
-
-```
-total_loss = main_loss + (avg_mtp_loss * mtp_loss_scaling_factor)
-
-where:
-  avg_mtp_loss = mean([mtp_1_loss, mtp_2_loss, ..., mtp_N_loss])
-```
-
-### Parameter Tuning Guidelines
-
-**`mtp_num_layers`:**
-- Start with `1` for most models (predicts 1 token ahead)
-- Use `2` for models > 100B parameters if memory allows
-- Higher values increase memory usage and training time proportionally
-
-**`mtp_loss_scaling_factor`:**
-- Default `0.1` works well for most models
-- Increase to `0.15-0.2` if MTP losses aren't decreasing
-- Decrease to `0.05-0.08` if main loss is being overshadowed
-- Scale factor should be inversely related to `mtp_num_layers` (more layers → lower factor)
-
-## Basic Usage: Training from Scratch
-
-### Minimal Configuration Example
-
-Here's a minimal example using the Qwen3 30B-A3B recipe with MTP enabled:
-
-```python
-from megatron.bridge.recipes.qwen.qwen3_moe import qwen3_30b_a3b_pretrain_config
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.config import ConfigContainer
-
-log_dir = "/path/to/log/dir"
-cfg: ConfigContainer = qwen3_30b_a3b_pretrain_config()
-cfg.logger.tensorboard_dir = log_dir + "/tb_logs"
-cfg.checkpoint.save = log_dir + "/checkpoints"
-cfg.checkpoint.load = log_dir + "/checkpoints"
-# Set up training dataset
-cfg.dataset.blend=[[
-    f"/path/to/dclm/preprocessed/dclm_{i:02d}_text_document"
-    for i in range(1, 11)
-], None]
-cfg.dataset.split="9999,8,2"
-cfg.dataset.path_to_cache = "/path/to/cache"
-# cfg.model.num_layers = 8  # train a smaller model if OOM
-# MTP Configuration
-cfg.model.mtp_num_layers = 1
-cfg.model.mtp_loss_scaling_factor = 0.1
-pretrain(cfg, forward_step)
-```
-Follow the [DCLM Tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/data/dclm) to prepare the training data.
-
-
-## MTP with Pipeline Parallelism
-
-When using Pipeline Parallelism (PP), **MTP layers must be placed in the last pipeline stage** alongside the loss computation layer. Configure this using custom pipeline layout settings (`pipeline_model_parallel_split_rank`).
-
-### Pipeline Layout Guidelines
-
-MTP layers take approximately the same training time as a regular transformer layer. When configuring your pipeline layout:
-
-- **Place MTP in the last PP stage** (required for correct loss computation)
-- **Reduce layers in other PP ranks** to balance computation time across stages
-- Example: For a 21-layer model with PP=4 and `mtp_num_layers=1`, you might use splits like `[5, 6, 6, 4]` instead of `[5, 5, 5, 6]` to account for MTP overhead in the last stage
-
-
-## Parallelism Support
-
-MTP is compatible with all major parallelism strategies in Megatron-Bridge:
-
-| Parallelism Type | Support Status | Notes |
-|------------------|----------------|-------|
-| **Tensor Parallelism (TP)** | ✅ Fully Supported | MTP layers are automatically sharded across TP ranks |
-| **Pipeline Parallelism (PP)** | ✅ Supported with Constraint | MTP must be in last pipeline stage (see above) |
-| **Expert Parallelism (EP)** | ✅ Fully Supported | Works with MoE models (DeepSeek-V3, Mixtral, etc.) |
-| **Context Parallelism (CP)** | ✅ Fully Supported | MTP supports long-context training via CP |
-| **Data Parallelism (DP)** | ✅ Fully Supported | Standard data parallelism works transparently |
-
-## Monitoring MTP Training
-
-### Per-Layer Loss Logging
-
-During training, you'll see losses for each MTP depth logged separately:
-
-```
-iteration      100/  300000 | consumed samples:         3200 | elapsed time per iteration (ms): 3738.6 | learning rate: 6.000000E-05 | global batch size:    32 | lm loss: 7.968678E+00 | load_balancing_loss: 1.329517E+00 | mtp_1 loss: 7.925096E+00 | loss scale: 1.0 | grad norm: 1.040 | number of skipped iterations:   0 | number of nan iterations:   0 |
-```
-
-### Interpreting Loss Values
-
-![MTP Loss Curves](../images/mtp_loss.png)
-
-The figure above shows typical training curves for MTP-enabled training:
-- **Left**: MTP auxiliary loss (`mtp_1 loss`) tracking the first additional token prediction
-- **Right**: Main language model loss (`lm loss`) for standard next-token prediction
-
-**Expected Patterns:**
-
-- **MTP losses are higher than main loss**: Predicting tokens further in the future is inherently harder. In the example above, `mtp_1 loss` (~4.3) is higher than `lm loss` (~3.9) at 3500 iterations.
-
-- **All losses decrease over training**: Both main and MTP losses should trend downward, as shown in the curves above.
-
-- **Loss gap remains relatively stable**: The difference between main and MTP losses should not grow significantly over training.
-
-**Red Flags:**
-
-- **NaN values**: Indicates training instability (see Troubleshooting section)
-- **Diverging losses**: If MTP losses increase while main loss decreases, reduce `mtp_loss_scaling_factor`
-- **Widening gap**: If MTP losses fall behind by > 1.0, increase `mtp_loss_scaling_factor`
-
-**MTP vs Non-MTP Comparison:**
-
-![MTP Loss Comparison](../images/mtp_loss_comparison.png)
-
-The figure above compares `lm loss` between MTP-enabled (blue) and non-MTP (red) training runs on Qwen3-30B-A3B. The curves do not differ significantly in the first few thousand iterations. Notably, the MTP-enabled run shows smoother behavior around iterations 1000 and 2300, where the non-MTP run exhibits more pronounced spikes.
-
-### TensorBoard/WandB Visualization
-
-MTP losses are automatically logged to TensorBoard and/or WandB. Look for:
-
-- `lm loss` - Main next-token prediction loss
-- `mtp_1 loss` - First auxiliary prediction loss
-- `mtp_2 loss` - Second auxiliary prediction loss (if `mtp_num_layers=2`)
-
-### Training Characteristics
-
-- MTP adds computational overhead due to additional forward passes
-- Memory usage increases proportionally to `mtp_num_layers`
-- MTP is designed to improve data efficiency during pre-training
-
-**Model Performance:**
-
-- MTP provides additional training signals at each token position
-- Can potentially improve downstream task performance
-- MTP-trained models can be used for speculative decoding during inference
-
-## Current Limitations
-
-The following features are not yet supported with MTP:
-
-| Feature | Status | Workaround |
-|---------|--------|------------|
-| **HuggingFace ↔ Megatron Checkpoint Conversion** | ⚠️ Model-specific | Conversion support varies by model; check model-specific documentation |
-| **Sequence Packing (Fine-Tuning)** | ❌ Not supported | For pre-training, no issues. For fine-tuning, set `packed_sequence_specs=None` |
-| **Cross-Attention** | ❌ Not supported | MTP only works with decoder-only models (GPT, Llama, etc.) |
-| **Learned Absolute Position Embeddings** | ❌ Not supported | Use RoPE (rotary position embeddings) or no position embeddings |
-| **Block-Based Activation Recomputation** | ❌ Not supported | Use `recompute_granularity="selective"` or `recompute_method="uniform"` |
-
-### Important Notes
-
-**Checkpoint Conversion:**
-
-HuggingFace ↔ Megatron checkpoint conversion with MTP is model-specific. Some models have conversion support planned, while others may not support MTP parameter mapping. Check the documentation for your specific model.
-
-**Sequence Packing:**
-
-MTP is incompatible with fine-tuning sequence packing (e.g., SFT with packed sequences). For pre-training, there are no sequence packing restrictions.
-
-## Troubleshooting Guide
-
-### Error: Out of Memory (OOM)
-
-MTP increases memory usage proportionally to `mtp_num_layers`. Try:
-- Reduce `mtp_num_layers` to 1
-- Enable activation recomputation: `recompute_granularity="selective"`
-- Increase pipeline parallelism
-- Reduce micro batch size
-
-### Error: MTP Loss is NaN
-
-This indicates training instability. Try:
-- Lower learning rate
-- Enable gradient clipping: `clip_grad=1.0`
-- Use BF16 instead of FP16
-- Reduce `mtp_loss_scaling_factor` to 0.05
-
-### Expected Log: `MTP layers not found on this PP rank`
-
-This is normal. Only the last pipeline stage builds MTP layers.
-
-## Additional Resources
-
-### Code Examples
-
-- **DeepSeek-V3 Recipe**: [`src/megatron/bridge/recipes/deepseek/deepseek_v3.py`](../../src/megatron/bridge/recipes/deepseek/deepseek_v3.py)
-  - Example of MTP with large-scale MoE model
-  - Predefined pipeline layouts for PP + MTP
-
-- **Qwen3-Next Recipe**: [`src/megatron/bridge/recipes/qwen/qwen3_next.py`](../../src/megatron/bridge/recipes/qwen/qwen3_next.py)
-  - Clean example of MTP configuration for dense models
-  - Good starting point for custom recipes
-
-- **MTP Core Implementation**: [`3rdparty/Megatron-LM/megatron/core/transformer/multi_token_prediction.py`](../../3rdparty/Megatron-LM/megatron/core/transformer/multi_token_prediction.py)
-  - Low-level MTP layer implementation
-  - Loss computation and logging helpers
-
-### Documentation
-
-- **Megatron Core MTP Guide**: [`3rdparty/Megatron-LM/docs/user-guide/features/multi_token_prediction.md`](https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/multi_token_prediction.md)
-  - Implementation notes and design decisions
-
-- **Pipeline Parallelism Guide**: [`docs/parallelisms.md`](../parallelisms.md)
-  - Understanding pipeline parallelism layouts
-  - Best practices for PP configuration
-
-### External Resources
-
-- **DeepSeek-V3 Technical Report**: [https://arxiv.org/abs/2412.19437](https://arxiv.org/abs/2412.19437)
-  - Original paper introducing MTP
-  - Section 3.2: "Multi-Token Prediction"
-  - Training details and ablation studies
-
-- **DeepSeek-V3 GitHub**: [https://github.com/deepseek-ai/DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3)
-  - Official implementation and model weights
-  - Training configurations and hyperparameters
-
-- **Megatron-LM GitHub**: [https://github.com/NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
-  - Upstream Megatron-Core implementation
-  - Issues and discussions
-
-### Getting Help
-
-If you encounter issues not covered in this guide:
-
-1. Check the [Megatron-Bridge GitHub Issues](https://github.com/NVIDIA-NeMo/Megatron-Bridge/issues)
-2. Review the [Megatron-LM Discussions](https://github.com/NVIDIA/Megatron-LM/discussions)
-
-When reporting issues, include:
-- Full training configuration (recipe and parameters)
-- Error messages and stack traces
-- GPU type and count
-- Megatron-Core version (`pip show megatron-core`)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/examples/models/nemotron_3/super/pretrain_nemotron_3_super.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import os
-import sys
-from typing import Tuple
-
-import torch
-from omegaconf import OmegaConf
-
-from megatron.bridge.recipes.nemotronh.nemotron_3_super import (
-    nemotron_3_super_pretrain_config as pretrain_config,
-)
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-    parse_hydra_overrides,
-)
-
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-def parse_cli_args() -> Tuple[argparse.Namespace, list[str]]:
-    """Parse command line arguments, separating known script args from OmegaConf overrides."""
-    parser = argparse.ArgumentParser(
-        description="Pretrain Nemotron 3 Super model using Megatron-Bridge with YAML and CLI overrides",
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--config-file",
-        type=str,
-        help="Path to the YAML OmegaConf override file. Default: conf/llama3_8b_pretrain_override_example.yaml",
-    )
-
-    # Parse known args for the script, remaining will be treated as overrides
-    args, cli_dotlist_overrides = parser.parse_known_args()
-    return args, cli_dotlist_overrides
-
-
-def main() -> None:
-    """
-    Entry point for the Nemotron 3 Super pretraining script.
-    """
-    args, cli_overrides = parse_cli_args()
-
-    cfg: ConfigContainer = pretrain_config()
-
-    # Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-    merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-    # Load and merge YAML overrides if a config file is provided
-    if args.config_file:
-        logger.debug(f"Loading YAML overrides from: {args.config_file}")
-        if not os.path.exists(args.config_file):
-            logger.error(f"Override YAML file not found: {args.config_file}")
-            sys.exit(1)
-        yaml_overrides_omega = OmegaConf.load(args.config_file)
-        merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega)
-        logger.debug("YAML overrides merged successfully.")
-
-    # Apply command-line overrides using Hydra-style parsing
-    if cli_overrides:
-        logger.debug(f"Applying Hydra-style command-line overrides: {cli_overrides}")
-        merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides)
-        logger.debug("Hydra-style command-line overrides applied successfully.")
-
-    # Apply the final merged OmegaConf configuration back to the original ConfigContainer
-    logger.debug("Applying final merged configuration back to Python ConfigContainer...")
-    final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-    # Apply overrides while preserving excluded fields
-    apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-
-    # Start training
-    logger.debug("Starting pretraining...")
-    pretrain(config=cfg, forward_step_func=forward_step)
-
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
-
-
-if __name__ == "__main__":
-    main()
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/entry-points.md
-```md
-# Training Entry Points
-
-Megatron Bridge provides unified training entry points for pretraining, Supervised Fine-Tuning (SFT), and Parameter-Efficient Fine-Tuning (PEFT). All training modes share the same underlying training loop architecture, differing primarily in their data handling and model configuration.
-
-## Main Entry Points
-
-The {py:func}`bridge.training.pretrain.pretrain` and {py:func}`bridge.training.finetune.finetune` functions are the primary entry points for training models—either pretraining from scratch or fine-tuning an existing checkpoint. Each function accepts a {py:class}`bridge.training.config.ConfigContainer` along with a `forward_step_func` that defines how the training loop should be run.
-
-
-## Forward Step Function
-
-The `forward_step_func` defines how each training step is executed. It should follow this signature:
-
-```python
-def forward_step_func(
-    global_state: GlobalState,
-    data_iterator: Iterable,
-    model: MegatronModule,
-    return_schedule_plan: bool = False,
-) -> tuple[Any, Callable]:
-    """Forward step function.
-    
-    Args:
-        global_state: Training state object containing configuration and utilities
-        data_iterator: Iterator over training/evaluation data
-        model: The model to perform forward step on
-        return_schedule_plan: Whether to return schedule plan (for MoE overlap)
-        
-    Returns:
-        tuple containing:
-        - output: Forward pass output (tensor or collection of tensors)
-        - loss_func: Function to compute loss from the output
-    """
-```
-
-### Responsibilities
-
-The forward step function has three main responsibilities:
-
-1. **Get a Batch**: Retrieve and process the next batch from the data iterator.
-2. **Run Forward Pass**: Execute the model's forward pass on the batch.
-3. **Return Loss Function**: Provide a function to compute loss from the output.
-
-### State Access
-
-Megatron Bridge automatically provides the {py:class}`bridge.training.state.GlobalState` object containing:
-- **Configuration**: Complete training configuration (`global_state.cfg`).
-- **Timers**: Performance monitoring utilities (`global_state.timers`).
-- **Training Progress**: Current step, consumed samples (`global_state.train_state`).
-- **Loggers**: TensorBoard and WandB loggers for metrics tracking.
-
-All configuration and state information is accessible through the injected `global_state` object.
-
-For complete implementation examples, see {py:func}`bridge.training.gpt_step.forward_step`.
-
-## Loss Calculation and Reduction
-
-The loss function returned by the forward step can follow different patterns based on your needs:
-
-### Loss Function Patterns
-
-1. **Standard Pattern**: Return `(loss, metadata_dict)`
-   - The loss is automatically averaged across microbatches
-   - Metadata dict contains named loss components for logging
-   - Most common pattern for standard training
-
-2. **Token-aware Pattern**: Return `(loss, num_tokens, metadata_dict)`
-   - Loss is averaged across both microbatches and tokens
-   - Useful when you want per-token loss averaging
-   - Recommended for variable-length sequences
-
-3. **Inference Pattern**: Return arbitrary data structures
-   - Used with `collect_non_loss_data=True` and `forward_only=True`
-   - Suitable for inference, evaluation metrics, or custom data collection
-   - No automatic loss processing applied
-
-### Automatic Loss Processing
-
-The training loop automatically handles:
-- **Microbatch Reduction**: Aggregates losses across all microbatches in the global batch.
-- **Distributed Reduction**: Performs all-reduce operations across data parallel ranks.
-- **Pipeline Coordination**: Only the last pipeline stage computes and reduces losses.
-- **Logging Integration**: Automatically logs loss components to TensorBoard/WandB.
-
-For implementation details, see {py:func}`bridge.training.train.train_step`; for an example loss function, see {py:func}`bridge.training.losses.masked_token_loss`.
-
-## Customization
-
-### When to Customize
-
-You can customize the forward step function when you need:
-
-- **Custom Loss Functions**: Beyond standard language modeling loss (e.g., adding regularization, multi-objective training).
-- **Multi-task Learning**: Training models on multiple tasks simultaneously with different loss components.
-- **Custom Data Processing**: Specialized batch preprocessing for domain-specific data formats.
-- **Additional Metrics**: Computing extra evaluation metrics during training.
-- **Model-specific Logic**: Special handling for custom model architectures or training procedures.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-from megatron.core.activations import squared_relu
-
-from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider
-from megatron.bridge.peft.base import PEFT
-from megatron.bridge.peft.lora import LoRA
-from megatron.bridge.recipes.common import _peft_common, _pretrain_common, _sft_common
-from megatron.bridge.recipes.utils.finetune_utils import default_peft_config
-from megatron.bridge.training.comm_overlap import CommOverlapConfig
-from megatron.bridge.training.config import ConfigContainer
-
-
-def nemotron_3_nano_pretrain_config() -> ConfigContainer:
-    """Return a pre-training config for Nemotron 3 Nano (30B-A3B MoE).
-
-    This is a MoE (Mixture of Experts) model with the following default parallelism:
-    - TP=4, PP=1, EP=8, SP=True
-    - DeepEP enabled for MoE token dispatch
-
-    Returns:
-        ConfigContainer: Pre-training configuration for Nemotron 3 Nano.
-    """
-    cfg = _pretrain_common()
-
-    # Model Configuration (MoE)
-    cfg.model = MambaModelProvider(
-        # Architecture (Nemotron 3 Nano 30B-A3B)
-        hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
-        num_layers=52,
-        hidden_size=2688,
-        mamba_num_heads=64,
-        kv_channels=128,
-        mamba_state_dim=128,
-        ffn_hidden_size=1856,
-        num_attention_heads=32,
-        mamba_head_dim=64,
-        seq_length=8192,
-        num_query_groups=2,
-        # MoE
-        num_moe_experts=128,
-        moe_ffn_hidden_size=1856,
-        moe_shared_expert_intermediate_size=3712,
-        moe_router_topk=6,
-        moe_router_topk_scaling_factor=2.5,
-        moe_router_num_groups=1,
-        moe_router_group_topk=1,
-        # NemotronH base
-        mamba_num_groups=8,
-        make_vocab_size_divisible_by=128,
-        activation_func=squared_relu,
-        masked_softmax_fusion=True,
-        apply_query_key_layer_scaling=False,
-        persist_layer_norm=True,
-        attention_softmax_in_fp32=False,
-        first_last_layers_bf16=True,
-        is_hybrid_model=True,
-        moe_aux_loss_coeff=0.0001,
-        moe_router_score_function="sigmoid",
-        moe_router_enable_expert_bias=True,
-        moe_router_load_balancing_type="seq_aux_loss",
-        moe_router_dtype="fp32",
-        moe_grouped_gemm=True,
-        moe_token_dispatcher_type="alltoall",
-        moe_permute_fusion=True,
-        moe_shared_expert_overlap=True,
-        # Parallelism
-        tensor_model_parallel_size=4,
-        pipeline_model_parallel_size=1,
-        pipeline_dtype=torch.bfloat16,
-        virtual_pipeline_model_parallel_size=None,
-        context_parallel_size=1,
-        sequence_parallel=True,
-        expert_tensor_parallel_size=1,
-        expert_model_parallel_size=8,
-    )
-
-    # Tokenizer (--tokenizer-model)
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Dataset Configuration
-    cfg.dataset.seq_length = 8192
-    cfg.dataset.blend = None  # Pass the path to the dataset here if not using mock data, along with weight. Ex: (["path/to/data1"], 0.2), [("path/to/data2", 0.8)]
-    cfg.dataset.num_workers = 8
-    cfg.dataset.mmap_bin_files = False
-
-    # Parallelism Settings (MoE-specific)
-    cfg.model.pipeline_model_parallel_layout = None
-
-    # MoE Token Dispatcher Settings
-    cfg.model.moe_token_dispatcher_type = "flex"
-    cfg.model.moe_flex_dispatcher_backend = "deepep"
-    cfg.model.moe_hybridep_num_sms = 16
-
-    # Training Configuration
-    cfg.train.train_iters = 39735
-    cfg.train.global_batch_size = 3072
-    cfg.train.micro_batch_size = 2
-    cfg.train.manual_gc = False
-    cfg.train.manual_gc_interval = 0
-
-    # Transformer Engine (TE)
-    cfg.model.transformer_impl = "transformer_engine"
-
-    # CUDA Graph
-    cfg.model.cuda_graph_impl = "none"
-    cfg.model.cuda_graph_scope = "full"
-    cfg.model.cuda_graph_warmup_steps = 3
-
-    # Kernel Selections
-    cfg.model.attention_backend = "fused"
-    cfg.model.moe_router_fusion = False
-    cfg.model.moe_permute_fusion = True
-    cfg.model.moe_grouped_gemm = True
-    cfg.model.cross_entropy_loss_fusion = True
-    cfg.model.cross_entropy_fusion_impl = "native"
-
-    # Memory Saving (recompute & offloading)
-    cfg.model.recompute_granularity = None
-    cfg.model.recompute_modules = None
-    cfg.model.fine_grained_activation_offloading = False
-    cfg.model.offload_modules = None
-
-    # =========================================================================
-    # FP8 & MXFP8 (Mixed Precision Settings)
-    # =========================================================================
-    # Note: mixed_precision="bf16_mixed" is set in _pretrain_common as default
-    # FP8 settings (disabled by default, uncomment to enable)
-    # cfg.mixed_precision.fp8_recipe = "tensorwise"
-    # cfg.mixed_precision.fp8 = None
-    # cfg.mixed_precision.fp8_param_gather = False
-    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
-    cfg.model.moe_router_padding_for_fp8 = False
-
-    # Optimizer Precision Settings
-    cfg.optimizer.use_precision_aware_optimizer = False
-    cfg.optimizer.main_grads_dtype = torch.float32
-    cfg.optimizer.main_params_dtype = torch.float32
-    cfg.optimizer.exp_avg_dtype = torch.float32
-    cfg.optimizer.exp_avg_sq_dtype = torch.float32
-
-    # Optimizer hyperparameters
-    cfg.optimizer.lr = 1.6e-3
-    cfg.optimizer.weight_decay = 0.1
-    cfg.optimizer.min_lr = 1.6e-5
-    cfg.scheduler.lr_warmup_iters = 333
-
-    # Communication Overlap
-    cfg.comm_overlap = CommOverlapConfig(
-        tp_comm_bootstrap_backend="nccl",
-        tp_comm_overlap=True,
-    )
-    cfg.comm_overlap.delay_wgrad_compute = False
-    cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
-    cfg.model.moe_shared_expert_overlap = False
-
-    # Checkpoint Configuration
-    # Paths are set in _pretrain_common by default. Override here if needed:
-    # cfg.checkpoint.load = "path/to/load"
-    # cfg.checkpoint.save = "path/to/save"
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-
-    # DDP Configuration
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.use_distributed_optimizer = True
-
-    # MoE Force Load Balancing
-    cfg.model.moe_router_force_load_balancing = False
-
-    cfg.model.init_method_std = 0.0173
-    cfg.model.apply_rope_fusion = False
-    cfg.model.use_fused_weighted_squared_relu = True
-
-    return cfg
-
-
-# =============================================================================
-# SFT Config
-# =============================================================================
-
-
-def nemotron_3_nano_sft_config() -> ConfigContainer:
-    """Return a full SFT config for Nemotron 3 Nano (30B-A3B MoE).
-
-    Default parallelism: TP=1, PP=1, EP=8, SP=False
-
-    Returns:
-        ConfigContainer with all settings pre-configured for Nemotron 3 Nano SFT.
-    """
-    cfg = _sft_common()
-
-    # Model config - Nemotron 3 Nano
-    cfg.model = MambaModelProvider(
-        # Architecture (Nemotron 3 Nano 30B-A3B)
-        hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
-        num_layers=52,
-        hidden_size=2688,
-        mamba_num_heads=64,
-        kv_channels=128,
-        mamba_state_dim=128,
-        ffn_hidden_size=1856,
-        num_attention_heads=32,
-        mamba_head_dim=64,
-        seq_length=2048,
-        num_query_groups=2,
-        # MoE
-        num_moe_experts=128,
-        moe_ffn_hidden_size=1856,
-        moe_shared_expert_intermediate_size=3712,
-        moe_router_topk=6,
-        moe_router_topk_scaling_factor=2.5,
-        moe_router_num_groups=1,
-        moe_router_group_topk=1,
-        # NemotronH base
-        mamba_num_groups=8,
-        make_vocab_size_divisible_by=128,
-        activation_func=squared_relu,
-        masked_softmax_fusion=True,
-        apply_query_key_layer_scaling=False,
-        persist_layer_norm=True,
-        attention_softmax_in_fp32=False,
-        first_last_layers_bf16=True,
-        is_hybrid_model=True,
-        moe_aux_loss_coeff=0.0001,
-        moe_router_score_function="sigmoid",
-        moe_router_enable_expert_bias=True,
-        moe_router_load_balancing_type="seq_aux_loss",
-        moe_router_dtype="fp32",
-        moe_grouped_gemm=True,
-        moe_token_dispatcher_type="alltoall",
-        moe_permute_fusion=True,
-        moe_shared_expert_overlap=True,
-        # Extra config
-        apply_rope_fusion=False,
-        attention_backend="fused",
-        init_method_std=0.0173,
-        use_fused_weighted_squared_relu=True,
-        calculate_per_token_loss=True,
-        # Parallelism
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=1,
-        pipeline_dtype=torch.bfloat16,
-        virtual_pipeline_model_parallel_size=None,
-        context_parallel_size=1,
-        sequence_parallel=False,
-        expert_tensor_parallel_size=1,
-        expert_model_parallel_size=8,
-    )
-
-    # Parallelism settings
-    cfg.model.pipeline_model_parallel_layout = None
-
-    # Sequence length
-    cfg.model.seq_length = 2048
-
-    # DeepEP settings - set to True to enable DeepEP (enabled by default for Nemotron)
-    enable_deepep = True
-    if enable_deepep:
-        cfg.model.moe_token_dispatcher_type = "flex"
-        cfg.model.moe_flex_dispatcher_backend = "deepep"
-        cfg.model.moe_shared_expert_overlap = False
-    else:
-        cfg.model.moe_token_dispatcher_type = "alltoall"
-        cfg.model.moe_flex_dispatcher_backend = None
-        cfg.model.moe_shared_expert_overlap = True
-
-    cfg.model.moe_hybridep_num_sms = 16
-
-    # TE (Transformer Engine)
-    cfg.model.transformer_impl = "transformer_engine"
-
-    # CUDA Graph
-    cfg.model.cuda_graph_impl = "none"
-    cfg.model.cuda_graph_scope = "full"
-    cfg.model.cuda_graph_warmup_steps = 3
-
-    # Kernel selections
-    cfg.model.attention_backend = "fused"
-    cfg.model.moe_router_fusion = False
-    cfg.model.moe_permute_fusion = True
-    cfg.model.moe_grouped_gemm = True
-    cfg.model.cross_entropy_loss_fusion = True
-    cfg.model.cross_entropy_fusion_impl = "native"
-
-    # Memory saving (recompute & offloading)
-    cfg.model.recompute_granularity = None
-    cfg.model.recompute_modules = None
-    cfg.model.fine_grained_activation_offloading = False
-    cfg.model.offload_modules = None
-
-    # FP8 & MXFP8 settings
-    # Note: mixed_precision="bf16_mixed" is set as default
-    # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default
-    # cfg.mixed_precision.fp8_recipe = "tensorwise"
-    # cfg.mixed_precision.fp8 = None
-    # cfg.mixed_precision.fp8_param_gather = False
-    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
-    cfg.optimizer.use_precision_aware_optimizer = False
-    cfg.optimizer.main_grads_dtype = torch.float32
-    cfg.optimizer.main_params_dtype = torch.float32
-    cfg.optimizer.exp_avg_dtype = torch.float32
-    cfg.optimizer.exp_avg_sq_dtype = torch.float32
-    cfg.model.moe_router_padding_for_fp8 = False
-
-    # MoE Force Load Balancing
-    cfg.model.moe_router_force_load_balancing = False
-
-    # Training config overrides
-    cfg.validation.eval_interval = 500
-
-    # Dataset config - packed_sequence=True by default (from _sft_common), seq_length=2048
-    # _sft_common already sets seq_length=2048 and packed_sequence=True
-    # Adjust pad_seq_to_mult for context parallelism
-    if cfg.model.context_parallel_size > 1:
-        cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2
-
-    # Optimizer overrides - Nemotron uses specific optimizer settings
-    cfg.optimizer.adam_beta2 = 0.95
-    cfg.optimizer.adam_eps = 1e-8
-    cfg.optimizer.weight_decay = 0.1
-    cfg.scheduler.start_weight_decay = 0.1
-    cfg.scheduler.end_weight_decay = 0.1
-    cfg.scheduler.lr_decay_style = "cosine"
-
-    # Tokenizer
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Checkpoint config overrides
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    # Uncomment below if using a pretrained checkpoint and provide path to the directory containing pretrained model for finetuning
-    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"
-
-    # Logger config
-    cfg.logger.log_interval = 10
-    cfg.logger.log_timers_to_tensorboard = False
-
-    # RNG config - Nemotron uses seed 1234
-    cfg.rng.seed = 1234
-
-    # DDP config
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.grad_reduce_in_fp32 = True
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.use_distributed_optimizer = True
-
-    # Communication overlap settings(default None, can pass CommOverlapConfig for advanced overlap), uncomment to enable
-    # cfg.comm_overlap = CommOverlapConfig(
-    #     tp_comm_bootstrap_backend="nccl",
-    #     tp_comm_overlap=True,
-    # )
-    # cfg.comm_overlap.delay_wgrad_compute = False
-    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
-
-    return cfg
-
-
-# =============================================================================
-# PEFT Config
-# =============================================================================
-
-
-def nemotron_3_nano_peft_config(
-    peft_scheme: str | PEFT = "lora",
-) -> ConfigContainer:
-    """Return a PEFT config for Nemotron 3 Nano (30B-A3B MoE).
-
-    Default parallelism: TP=1, PP=1, EP=8, SP=False
-
-    Args:
-        peft_scheme: PEFT scheme - "lora", "dora", or a custom PEFT instance.
-
-    Returns:
-        ConfigContainer with all settings pre-configured for Nemotron 3 Nano PEFT.
-    """
-    cfg = _peft_common()
-
-    # Model config - PEFT uses same parallelism as SFT
-    cfg.model = MambaModelProvider(
-        # Architecture (Nemotron 3 Nano 30B-A3B)
-        hybrid_layer_pattern="MEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEM*EMEMEMEM*EMEMEMEME",
-        num_layers=52,
-        hidden_size=2688,
-        mamba_num_heads=64,
-        kv_channels=128,
-        mamba_state_dim=128,
-        ffn_hidden_size=1856,
-        num_attention_heads=32,
-        mamba_head_dim=64,
-        seq_length=2048,
-        num_query_groups=2,
-        # MoE
-        num_moe_experts=128,
-        moe_ffn_hidden_size=1856,
-        moe_shared_expert_intermediate_size=3712,
-        moe_router_topk=6,
-        moe_router_topk_scaling_factor=2.5,
-        moe_router_num_groups=1,
-        moe_router_group_topk=1,
-        # NemotronH base
-        mamba_num_groups=8,
-        make_vocab_size_divisible_by=128,
-        activation_func=squared_relu,
-        masked_softmax_fusion=True,
-        apply_query_key_layer_scaling=False,
-        persist_layer_norm=True,
-        attention_softmax_in_fp32=False,
-        first_last_layers_bf16=True,
-        is_hybrid_model=True,
-        moe_aux_loss_coeff=0.0001,
-        moe_router_score_function="sigmoid",
-        moe_router_enable_expert_bias=True,
-        moe_router_load_balancing_type="seq_aux_loss",
-        moe_router_dtype="fp32",
-        moe_grouped_gemm=True,
-        moe_token_dispatcher_type="alltoall",
-        moe_permute_fusion=True,
-        moe_shared_expert_overlap=True,
-        # Extra config
-        apply_rope_fusion=False,
-        attention_backend="fused",
-        init_method_std=0.0173,
-        use_fused_weighted_squared_relu=True,
-        calculate_per_token_loss=True,
-        # Parallelism
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=1,
-        pipeline_dtype=torch.bfloat16,
-        virtual_pipeline_model_parallel_size=None,
-        context_parallel_size=1,
-        sequence_parallel=False,
-        expert_tensor_parallel_size=1,
-        expert_model_parallel_size=8,
-    )
-
-    # Parallelism settings
-    cfg.model.pipeline_model_parallel_layout = None
-
-    # Sequence length
-    cfg.model.seq_length = 2048
-
-    # DeepEP settings - set to True to enable DeepEP (enabled by default for Nemotron)
-    enable_deepep = True
-    if enable_deepep:
-        cfg.model.moe_token_dispatcher_type = "flex"
-        cfg.model.moe_flex_dispatcher_backend = "deepep"
-        cfg.model.moe_shared_expert_overlap = False
-    else:
-        cfg.model.moe_token_dispatcher_type = "alltoall"
-        cfg.model.moe_flex_dispatcher_backend = None
-        cfg.model.moe_shared_expert_overlap = True
-
-    cfg.model.moe_hybridep_num_sms = 16
-
-    # TE (Transformer Engine)
-    cfg.model.transformer_impl = "transformer_engine"
-
-    # CUDA Graph
-    cfg.model.cuda_graph_impl = "none"
-    cfg.model.cuda_graph_scope = "full"
-    cfg.model.cuda_graph_warmup_steps = 3
-
-    # Kernel selections
-    cfg.model.attention_backend = "fused"
-    cfg.model.moe_router_fusion = False
-    cfg.model.moe_permute_fusion = True
-    cfg.model.moe_grouped_gemm = True
-    cfg.model.cross_entropy_loss_fusion = True
-    cfg.model.cross_entropy_fusion_impl = "native"
-
-    # Memory saving
-    cfg.model.recompute_granularity = None
-    cfg.model.recompute_modules = None
-    cfg.model.fine_grained_activation_offloading = False
-    cfg.model.offload_modules = None
-
-    # FP8 & MXFP8 settings
-    # These are defaults for FP8, enable them if using FP8 - FP8 is not enabled by default
-    # cfg.mixed_precision.fp8_recipe = "tensorwise"
-    # cfg.mixed_precision.fp8 = None
-    # cfg.mixed_precision.fp8_param_gather = False
-    # cfg.mixed_precision.reuse_grad_buf_for_mxfp8_param_ag = False
-    cfg.optimizer.use_precision_aware_optimizer = False
-    cfg.optimizer.main_grads_dtype = torch.float32
-    cfg.optimizer.main_params_dtype = torch.float32
-    cfg.optimizer.exp_avg_dtype = torch.float32
-    cfg.optimizer.exp_avg_sq_dtype = torch.float32
-    cfg.model.moe_router_padding_for_fp8 = False
-
-    # MoE Force Load Balancing
-    cfg.model.moe_router_force_load_balancing = False
-
-    # PEFT config - Nemotron uses Mamba-specific target modules
-    mamba_target_modules = ["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]
-    if isinstance(peft_scheme, str) and peft_scheme.lower() in ["lora", "dora"]:
-        cfg.peft = default_peft_config(peft_scheme, target_modules=mamba_target_modules)
-    elif isinstance(peft_scheme, PEFT):
-        cfg.peft = peft_scheme
-    else:
-        # Default to LoRA with Mamba target modules
-        cfg.peft = LoRA(
-            target_modules=mamba_target_modules,
-            dim=32,
-            alpha=32,
-            dropout=0.0,
-            dropout_position="pre",
-            lora_A_init_method="xavier",
-            lora_B_init_method="zero",
-        )
-
-    # Training config overrides
-    cfg.validation.eval_interval = 500
-
-    # Dataset config - packed_sequence=True by default (from _peft_common), seq_length=2048
-    # _peft_common already sets seq_length=2048 and packed_sequence=True
-    # Adjust pad_seq_to_mult for context parallelism
-    if cfg.model.context_parallel_size > 1:
-        cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2
-
-    # Optimizer overrides
-    cfg.optimizer.adam_beta2 = 0.95
-    cfg.optimizer.adam_eps = 1e-8
-    cfg.optimizer.weight_decay = 0.1
-    cfg.scheduler.start_weight_decay = 0.1
-    cfg.scheduler.end_weight_decay = 0.1
-    cfg.scheduler.lr_decay_style = "cosine"
-
-    # Tokenizer
-    cfg.tokenizer.tokenizer_model = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
-
-    # Checkpoint config overrides
-    cfg.checkpoint.save_interval = 200
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.dist_ckpt_strictness = "log_all"
-    cfg.checkpoint.ckpt_assume_constant_structure = True
-    # Uncomment below if using a pretrained checkpoint and provide path to the directory containing pretrained model for finetuning
-    # cfg.checkpoint.pretrained_checkpoint = "/path/to/checkpoint"
-
-    # Logger config
-    cfg.logger.log_interval = 10
-    cfg.logger.log_timers_to_tensorboard = False
-
-    # RNG config - Nemotron uses seed 1234
-    cfg.rng.seed = 1234
-
-    # DDP config
-    cfg.ddp.check_for_nan_in_grad = True
-    cfg.ddp.grad_reduce_in_fp32 = True
-    cfg.ddp.overlap_grad_reduce = True
-    cfg.ddp.overlap_param_gather = True
-    cfg.ddp.use_distributed_optimizer = True
-
-    # Communication overlap settings(default None, can pass CommOverlapConfig for advanced overlap), uncomment to enable
-    # cfg.comm_overlap = CommOverlapConfig(
-    #     tp_comm_bootstrap_backend="nccl",
-    #     tp_comm_overlap=True,
-    # )
-    # cfg.comm_overlap.delay_wgrad_compute = False
-    # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
-
-    return cfg
-
-
-__all__ = [
-    # Pretrain config
-    "nemotron_3_nano_pretrain_config",
-    # SFT config
-    "nemotron_3_nano_sft_config",
-    # PEFT config
-    "nemotron_3_nano_peft_config",
-]
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/performant_lora.png
-```png
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/tp_comm_overlap.png
-```png
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/callbacks.md
-```md
-# Callbacks
-
-Megatron Bridge provides a lightweight callback system for injecting custom logic into the training and evaluation loop without modifying framework code. This is ideal for proprietary integrations or custom logging and metrics tracking.
-
-## Quick Start
-
-### Class-Based Callbacks
-
-Subclass {py:class}`bridge.training.callbacks.Callback` and override event methods:
-
-```python
-import time
-
-from megatron.bridge.training.callbacks import Callback
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config
-
-class MyCallback(Callback):
-    def on_train_start(self, context):
-        context.user_state['start_time'] = time.time()
-        print(f"Training started at step {context.state.train_state.step}")
-
-    def on_train_step_end(self, context):
-        if context.loss_dict:
-            print(f"Step {context.state.train_state.step}: loss={context.loss_dict}")
-
-    def on_train_end(self, context):
-        elapsed = time.time() - context.user_state['start_time']
-        print(f"Training completed in {elapsed:.2f}s")
-
-# Create a config that fits on a single GPU
-config = qwen25_500m_pretrain_config()
-
-# Pass callbacks to pretrain
-pretrain(config, forward_step, callbacks=[MyCallback()])
-```
-
-### Functional Callbacks
-
-Register functions directly with {py:class}`bridge.training.callbacks.CallbackManager`:
-
-```python
-from megatron.bridge.training.callbacks import CallbackManager
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config
-
-def log_step(context):
-    step = context.state.train_state.step
-    if context.loss_dict:
-        print(f"Step {step}: {context.loss_dict}")
-
-callback_manager = CallbackManager()
-callback_manager.register("on_train_step_end", log_step)
-
-# Create a config that fits on a single GPU
-config = qwen25_500m_pretrain_config()
-
-pretrain(config, forward_step, callbacks=callback_manager)
-```
-
-### Mixing Both Patterns
-
-Both registration patterns can be combined:
-
-```python
-from megatron.bridge.training.callbacks import CallbackManager
-from megatron.bridge.training.gpt_step import forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.recipes.qwen import qwen25_500m_pretrain_config
-
-manager = CallbackManager()
-manager.add(MyCallback())
-manager.add([TimingCallback(), MetricsCallback()])
-manager.register("on_eval_end", lambda ctx: print("Evaluation complete!"))
-
-# Create a config that fits on a single GPU
-config = qwen25_500m_pretrain_config()
-
-pretrain(config, forward_step, callbacks=manager)
-```
-
-## Available Events
-
-### Training Events
-
-| Event | When Fired | Available Context Fields |
-|-------|------------|-------------------------|
-| `on_train_start` | After `model.train()`, before training loop | `state`, `model`, `user_state`, `optimizer`, `scheduler` |
-| `on_train_step_start` | Before each training step | `state`, `model`, `user_state`, `optimizer`, `scheduler` |
-| `on_train_step_end` | After each training step | `state`, `model`, `user_state`, `optimizer`, `scheduler`, `loss_dict`, `grad_norm`, `skipped_iter` |
-| `on_train_end` | After training loop completes | `state`, `model`, `user_state`, `optimizer`, `scheduler` |
-
-### Validation Events
-
-| Event | When Fired | Available Context Fields |
-|-------|------------|-------------------------|
-| `on_eval_start` | After `model.eval()`, before validation loop | `state`, `model`, `user_state` |
-| `on_eval_step_start` | Before each validation step | `state`, `model`, `user_state` |
-| `on_eval_step_end` | After each validation step | `state`, `model`, `user_state` |
-| `on_eval_end` | After validation completes | `state`, `model`, `user_state`, `total_loss_dict` |
-
-### Test Events
-
-| Event | When Fired | Available Context Fields |
-|-------|------------|-------------------------|
-| `on_test_start` | After `model.eval()`, before test loop | `state`, `model`, `user_state` |
-| `on_test_step_start` | Before each test step | `state`, `model`, `user_state` |
-| `on_test_step_end` | After each test step | `state`, `model`, `user_state` |
-| `on_test_end` | After test completes | `state`, `model`, `user_state`, `total_loss_dict` |
-
-### Checkpoint Events
-
-| Event | When Fired | Available Context Fields |
-|-------|------------|-------------------------|
-| `on_checkpoint_save` | After a checkpoint is saved | `state`, `model`, `user_state`, `optimizer` |
-
-
-## CallbackContext
-
-The {py:class}`bridge.training.callbacks.CallbackContext` provides access to framework state:
-
-### Always Available
-
-- **`state`**: {py:class}`bridge.training.state.GlobalState` - Contains config, train_state, timers, and loggers
-- **`model`**: List of model chunks
-- **`user_state`**: Mutable dict for storing data across callback invocations
-
-### Training Events Only
-
-- **`optimizer`**: The optimizer instance
-- **`scheduler`**: Learning rate scheduler
-
-### Event-Specific Fields
-
-- **`loss_dict`** (`on_train_step_end`): Dictionary of reduced losses from the training step
-- **`grad_norm`** (`on_train_step_end`): Gradient norm (if computed)
-- **`skipped_iter`** (`on_train_step_end`): Whether the iteration was skipped
-- **`total_loss_dict`** (`on_eval_end`, `on_test_end`): Aggregated evaluation/test losses
-
-## User State
-
-The `CallbackManager` owns a `user_state` dictionary that persists across all callback invocations during a training run. Use it to share data between callbacks or accumulate metrics:
-
-```python
-class StepCounterCallback(Callback):
-    def on_train_start(self, context):
-        context.user_state['callback_step_count'] = 0
-
-    def on_train_step_end(self, context):
-        context.user_state['callback_step_count'] += 1
-
-    def on_train_end(self, context):
-        print(f"Callback saw {context.user_state['callback_step_count']} steps")
-```
-
-## Distributed Training
-
-Callbacks fire on **all ranks** without framework-level synchronization. If your callback should only run on specific ranks, add guards:
-
-```python
-import torch.distributed as dist
-
-class RankZeroCallback(Callback):
-    def on_train_step_end(self, context):
-        if dist.get_rank() == 0:
-            print(f"Step {context.state.train_state.step} complete")
-```
-
-## Exception Handling
-
-Exceptions from callbacks propagate to the caller. The framework does not catch or handle callback exceptions. If your callback might fail, wrap it in a try-except:
-
-```python
-def safe_callback(context):
-    try:
-        # Your logic here
-        external_service.log(context.loss_dict)
-    except Exception as e:
-        print(f"Callback failed: {e}")
-        # Don't re-raise to avoid stopping training
-```
-
-## Execution Order
-
-Callbacks fire in registration order:
-
-1. Callbacks added via `add()` fire in the order they were added
-2. Callbacks registered via `register()` fire in the order they were registered
-3. If both methods are used, the order depends on when each was called
-
-## Introspection
-
-Query registered callbacks:
-
-```python
-manager = CallbackManager()
-manager.register("on_train_start", my_fn)
-
-# Check if any callbacks exist for an event
-if manager.has_callbacks("on_train_start"):
-    print("Callbacks registered for on_train_start")
-
-# List all callbacks for an event
-callbacks = manager.list_callbacks("on_train_start")
-print(f"Found {len(callbacks)} callbacks")
-
-# Get all valid event names
-print(manager.events)  # frozenset of valid event names
-```
-
-## Design Principles
-
-The callback system follows these principles:
-
-1. **First-Party Isolation**: Framework code never uses callbacks for its own logic. Callbacks are strictly for third-party extensions.
-
-2. **Zero Overhead**: When no callbacks are registered, there is zero performance overhead.
-
-3. **Safety**: Callbacks receive framework state but modifying it is at the user's own risk. The framework makes no guarantees about the effects of modifications.
-
-## Examples
-
-### Proprietary Metrics
-
-```python
-class ProprietaryMetricsCallback(Callback):
-    """Send metrics to internal monitoring system."""
-
-    def __init__(self, endpoint: str):
-        self.client = InternalMetricsClient(endpoint)
-
-    def on_train_step_end(self, context):
-        if context.loss_dict:
-            self.client.send({
-                "step": context.state.train_state.step,
-                "loss": context.loss_dict.get("lm loss"),
-                "grad_norm": context.grad_norm,
-                "cluster_id": os.environ.get("CLUSTER_ID"),
-            })
-```
-
-## API Reference
-
-- {py:class}`bridge.training.callbacks.Callback`
-- {py:class}`bridge.training.callbacks.CallbackContext`
-- {py:class}`bridge.training.callbacks.CallbackManager`
-- {py:func}`bridge.training.callbacks.normalize_callbacks`
-- {py:func}`bridge.training.callbacks.should_fire`
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/nemotron3.md
-```md
-# Nemotron 3 Nano 
-[Nemotron 3 Nano](https://huggingface.co/collections/nvidia/nvidia-nemotron-v3) is a large language model (LLM) trained from scratch by NVIDIA, and designed as a unified model for both reasoning and non-reasoning tasks. The model employs a hybrid Mixture-of-Experts (MoE) architecture, consisting of 23 Mamba-2 and MoE layers, along with 6 Attention layers. Each MoE layer includes 128 experts plus 1 shared expert, with 6 experts activated per token. The model has 3.5B active parameters and 30B parameters in total.
-
-NeMo Megatron Bridge supports pretraining, full-parameter fine-tuning, and LoRA fine-tuning of this model. The fine-tuned model can be converted back to the 🤗 Hugging Face format for downstream evaluation.
-
-```{important}
-Please use the custom container `nvcr.io/nvidia/nemo:25.11.nemotron_3_nano` when working with this model.
-
-Run all commands from `/opt/Megatron-Bridge` (e.g. `docker run -w /opt/Megatron-Bridge ...`)
-```
-
-```{tip}
-We use the following environment variables throughout this page
-- `HF_MODEL_ID=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16`
-- `MEGATRON_MODEL_PATH=/models/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` (feel free to set your own path)
-```
-
-
-## Conversion with 🤗 Hugging Face
-
-### Import HF → Megatron
-To import the HF model to your desired `$MEGATRON_MODEL_PATH`, run the following command.
-```bash
-python examples/conversion/convert_checkpoints.py import  \
---hf-model $HF_MODEL_ID  \
---megatron-path /path/to/output/megatron/ckpt \
---trust-remote-code
-```
-
-### Export Megatron → HF
-```bash
-python examples/conversion/convert_checkpoints.py export  \
---hf-model $HF_MODEL_ID  \
---megatron-path /path/to/trained/megatron/ckpt \
---hf-path /path/to/output/hf/ckpt
-```
-
-## Pretraining Examples
-```bash
-BLEND_PATH=/path/to/dataset/blend
-TOKENIZER_MODEL=/path/to/tiktok/tokenizer/model
-
-torchrun --nproc-per-node=8 examples/models/nemotron_3/pretrain_nemotron_3_nano.py \
---per-split-data-args-path=${BLEND_PATH} \
---tokenizer-model=${TOKENIZER_MODEL} \
-train.global_batch_size=3072 \
-train.train_iters=39500 \
-scheduler.lr_warmup_iters=350
-```
-
-Notes: 
-- The default parallelism settings are TP=4, EP=8, PP=1, CP=1. It is recommended to run this pretraining on 4 H100 nodes (32 GPUs).
-- To enable wandb logging, you can append `logger.wandb_project=PROJECT_NAME`, `wandb_entity=ENTITY_NAME`, and `wandb_exp_name=EXP_NAME` arguments
-- If `BLEND_PATH` and `TOKENIZER_MODEL` are not specified, a mock dataset will be used.
-
-
-## Finetuning Recipes
-
-### Full Parameter Fine-Tuning
-```bash
-torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \
-train.global_batch_size=128 \
-train.train_iters=100 \
-scheduler.lr_warmup_iters=10 \
-checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt
-```
-
-Notes:
-- Default parallelism TP=1, EP=8, PP=1, CP=1. Running this recipe requires at least 2 H100 nodes (16 GPUs).
-- By default, the [SQuAD](https://huggingface.co/datasets/rajpurkar/squad) dataset is used. To use a customized dataset, see this [tutorial](https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/tutorials/recipes/llama#quickstart).
-- Fine-tuning requires a pretrained megatron checkpoint, which can be obtained in "Import HF → Megatron" section above
-
-
-### LoRA Fine-Tuning
-To enable LoRA fine-tuning, pass `--peft lora` to the script:
-```bash
-torchrun --nproc-per-node=8 examples/models/nemotron_3/finetune_nemotron_3_nano.py \
---peft lora \
-train.global_batch_size=128 \
-train.train_iters=100 \
-scheduler.lr_warmup_iters=10 \
-checkpoint.pretrained_checkpoint=/path/to/output/megatron/ckpt
-```
-
-Notes:
-- By default, the target modules are linear layers `["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2", "in_proj", "out_proj"]` in the model
-- The remaining settings are the same as for full-parameter fine-tuning above.
-
-
-A LoRA checkpoint only contains the learnable adapter weights. In order to convert the LoRA checkpoint to Hugging Face format for downstream evaluation, it is necessary to merge the LoRA adapters back to the base model.
-
-```bash
-python examples/peft/merge_lora.py \
---hf-model-path $HF_MODEL_ID \
---lora-checkpoint /path/to/lora/ckpt/iter_xxxxxxx \
---output /path/to/merged/ckpt
-```
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/index.md
-```md
-```{include} ../README.md
-:relative-docs: docs/
-```
-
-```{toctree}
-:caption: Guides
-:hidden:
-
-parallelisms.md
-performance-summary.md
-performance-summary-archive.md
-performance-guide.md
-recipe-usage.md
-nemo2-migration-guide.md
-megatron-lm-to-megatron-bridge.md
-```
-
-```{toctree}
-:caption: Bridge with 🤗 Hugging Face
-:hidden:
-
-bridge-guide.md
-bridge-tech-details.md
-```
-
-```{toctree}
-:caption: Supported Models
-:hidden:
-
-models/llm/index.md
-models/vlm/index.md
-```
-
-```{toctree}
-:caption: Training and Customization
-:hidden:
-
-training/config-container-overview.md
-training/entry-points.md
-training/training-loop-settings.md
-training/optimizer-scheduler.md
-training/logging.md
-training/profiling.md
-training/checkpointing.md
-training/megatron-fsdp.md
-training/resiliency.md
-training/mixed-precision.md
-training/cuda-graphs.md
-training/hybrid-context-parallel.md
-training/communication-overlap.md
-training/attention-optimizations.md
-training/activation-recomputation.md
-training/cpu-offloading.md
-training/peft.md
-training/packed-sequences.md
-training/multi-token-prediction.md
-training/distillation.md
-training/pruning.md
-training/callbacks.md
-```
-
-```{toctree}
-:caption: Model Optimization with ModelOpt
-:hidden:
-
-modelopt/quantization.md
-```
-
-```{toctree}
-:caption: Development
-:hidden:
-
-adding-new-models.md
-bridge-rl-integration.md
-documentation.md
-apidocs/index.rst
-```
-
-```{toctree}
-:caption: Releases
-:hidden:
-
-releases/release-process.md
-releases/software-versions.md
-releases/changelog.md
-releases/known-issues.md
-```
-
-```{toctree}
-:caption: Agent Skills
-:hidden:
-
-skills-index
-```
-
-```{toctree}
-:caption: Directory Readme Files
-:hidden:
-
-README.md
-models/README.md
-models/llm/README.md
-models/vlm/README.md
-releases/README.md
-training/README.md
-modelopt/README.md
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/__init__.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Nemotron Nano v2 models
-# Nemotron 3 Nano models
-from megatron.bridge.recipes.nemotronh.nemotron_3_nano import (
-    nemotron_3_nano_peft_config,
-    nemotron_3_nano_pretrain_config,
-    nemotron_3_nano_sft_config,
-)
-
-# Nemotron 3 Super models
-from megatron.bridge.recipes.nemotronh.nemotron_3_super import (
-    nemotron_3_super_peft_config,
-    nemotron_3_super_pretrain_config,
-    nemotron_3_super_sft_config,
-)
-from megatron.bridge.recipes.nemotronh.nemotron_nano_v2 import (
-    nemotron_nano_9b_v2_peft_config,
-    nemotron_nano_9b_v2_pretrain_config,
-    nemotron_nano_9b_v2_sft_config,
-    nemotron_nano_12b_v2_peft_config,
-    nemotron_nano_12b_v2_pretrain_config,
-    nemotron_nano_12b_v2_sft_config,
-)
-
-# NemotronH models
-from megatron.bridge.recipes.nemotronh.nemotronh import (
-    nemotronh_4b_peft_config,
-    nemotronh_4b_pretrain_config,
-    nemotronh_4b_sft_config,
-    nemotronh_8b_peft_config,
-    nemotronh_8b_pretrain_config,
-    nemotronh_8b_sft_config,
-    nemotronh_47b_peft_config,
-    nemotronh_47b_pretrain_config,
-    nemotronh_47b_sft_config,
-    nemotronh_56b_peft_config,
-    nemotronh_56b_pretrain_config,
-    nemotronh_56b_sft_config,
-)
-
-
-__all__ = [
-    # NemotronH models
-    "nemotronh_4b_pretrain_config",
-    "nemotronh_8b_pretrain_config",
-    "nemotronh_47b_pretrain_config",
-    "nemotronh_56b_pretrain_config",
-    "nemotronh_4b_sft_config",
-    "nemotronh_8b_sft_config",
-    "nemotronh_47b_sft_config",
-    "nemotronh_56b_sft_config",
-    "nemotronh_4b_peft_config",
-    "nemotronh_8b_peft_config",
-    "nemotronh_47b_peft_config",
-    "nemotronh_56b_peft_config",
-    # Nemotron Nano v2 models
-    "nemotron_nano_9b_v2_pretrain_config",
-    "nemotron_nano_12b_v2_pretrain_config",
-    "nemotron_nano_9b_v2_sft_config",
-    "nemotron_nano_12b_v2_sft_config",
-    "nemotron_nano_9b_v2_peft_config",
-    "nemotron_nano_12b_v2_peft_config",
-    # Nemotron 3 Nano models
-    "nemotron_3_nano_pretrain_config",
-    "nemotron_3_nano_sft_config",
-    "nemotron_3_nano_peft_config",
-    # Nemotron 3 Super models
-    "nemotron_3_super_pretrain_config",
-    "nemotron_3_super_sft_config",
-    "nemotron_3_super_peft_config",
-]
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/models/llm/index.md
-```md
-# Large Language Models
-
-This section documents Large Language Models supported by Megatron Bridge, with examples for converting to/from 🤗 Hugging Face and links to training recipes.
-
-```{toctree}
-:hidden:
-
-deepseek-v2.md
-deepseek-v3.md
-gemma2.md
-gemma3.md
-glm45.md
-gpt-oss.md
-llama3.md
-llama-nemotron.md
-mistral.md
-moonlight.md
-nemotron3.md
-nemotron3-super.md
-nemotronh.md
-olmoe.md
-qwen.md
-```
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/images/activation-recomputation-example-1.jpg
-```jpg
-[Binary file]
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Dataset configuration utilities for recipes and training scripts."""
-
-import logging
-from typing import Callable, List, Optional, Tuple
-
-from megatron.bridge.data.energon.energon_provider import EnergonProvider
-from megatron.bridge.data.loaders import get_blend_and_blend_per_split
-from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider
-from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider
-from megatron.bridge.recipes.utils.finetune_utils import (
-    default_gsm8k_config,
-    default_openmathinstruct2_config,
-    default_squad_config,
-)
-from megatron.bridge.training.config import (
-    ConfigContainer,
-    FinetuningDatasetConfig,
-    GPTDatasetConfig,
-    MockGPTDatasetConfig,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-_BLEND_TYPE = Optional[Tuple[List[str], Optional[List[float]]]]
-_BLEND_PER_SPLIT_TYPE = Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]
-_SPLIT_TYPE = Optional[str]
-
-
-def get_blend_fields_from_data_paths(
-    data_paths: Optional[List[str]] = None,
-    data_args_path: Optional[str] = None,
-    train_data_path: Optional[List[str]] = None,
-    valid_data_path: Optional[List[str]] = None,
-    test_data_path: Optional[List[str]] = None,
-    per_split_data_args_path: Optional[str] = None,
-    mock: bool = False,
-) -> Tuple[_BLEND_TYPE, _BLEND_PER_SPLIT_TYPE, _SPLIT_TYPE]:
-    """
-    Common configuration logic for blend, blend_per_split, split dataset config fields.
-
-    Handles mock and real data. If no path to data is provided, mock data will be used.
-    Prioritizes `data_paths` over split data paths. For all of `data_paths`, `train_data_path`,
-    `valid_data_path`, and `test_data_path`, two formats are accepted: either (1) a list of prefixes,
-    e.g. ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped
-    list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"]
-
-    Args:
-        data_paths (Optional[List[str]]): List of paths to dataset files.
-        data_args_path (Optional[str]): Path to file containing data arguments.
-        train_data_path (Optional[List[str]]): List of training data paths.
-        valid_data_path (Optional[List[str]]): List of validation data paths.
-        test_data_path (Optional[List[str]]): List of test data paths.
-        per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration.
-        mock (bool): Whether to use mock data. If True, ignores data_paths.
-
-    Returns:
-        A tuple (blend, blend_per_split, split), the corresponding fields to be passed to GPTDatasetConfig.
-    """
-    has_any_data_config = any(
-        [data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path]
-    )
-
-    if mock or not has_any_data_config:
-        # Mock data configuration
-        blend = None  # Will trigger mock mode automatically
-        blend_per_split = None  # Will trigger mock mode automatically
-        split = "1,1,1"  # Equal splits for testing
-    else:
-        # Real data configuration
-        blend, blend_per_split = get_blend_and_blend_per_split(
-            data_paths=data_paths,
-            data_args_path=data_args_path,
-            train_data_paths=train_data_path,
-            valid_data_paths=valid_data_path,
-            test_data_paths=test_data_path,
-            per_split_data_args_path=per_split_data_args_path,
-        )
-
-        if blend_per_split is not None:
-            # When using blend_per_split, split should be None
-            split = None
-        elif blend is not None:
-            # When using regular blend, we can use split
-            split = "9999,8,2"
-        else:
-            # No data provided, fall back to mock mode
-            split = "1,1,1"
-
-    return blend, blend_per_split, split
-
-
-# ---------------------------------------------------------------------------
-# Unified dataset type registry
-# ---------------------------------------------------------------------------
-
-DATASET_TYPES = [
-    "llm-pretrain",
-    "llm-pretrain-mock",
-    "llm-finetune",
-    "llm-finetune-preloaded",
-    "vlm-energon",
-    "vlm-hf",
-    "vlm-preloaded",
-]
-
-LLM_FINETUNE_PRESETS: dict[str, Callable] = {
-    "squad": default_squad_config,
-    "openmathinstruct2": default_openmathinstruct2_config,
-    "gsm8k": default_gsm8k_config,
-}
-
-
-def extract_and_remove_override(cli_overrides: list[str], key: str, default: str | None = None) -> str | None:
-    """Extract a Hydra-style override (key=value) from *cli_overrides* and remove it.
-
-    Returns the value if found, otherwise *default*.
-    """
-    prefix = f"{key}="
-    for i, override in enumerate(cli_overrides):
-        if override.startswith(prefix):
-            value = override[len(prefix) :]
-            cli_overrides.pop(i)
-            return value
-    return default
-
-
-def _resolve_seq_length(config: ConfigContainer, seq_length: int | None) -> int:
-    """Resolve sequence length: explicit arg > model config > 4096 fallback."""
-    if seq_length is not None:
-        return seq_length
-    if hasattr(config, "model") and config.model is not None and hasattr(config.model, "seq_length"):
-        return config.model.seq_length
-    return 4096
-
-
-def apply_dataset_override(
-    config: ConfigContainer,
-    dataset_type: str,
-    packed_sequence: bool = False,
-    seq_length: int | None = None,
-    cli_overrides: list[str] | None = None,
-) -> ConfigContainer:
-    """Replace the recipe's dataset config based on the requested dataset type.
-
-    Args:
-        config: The recipe config to modify.
-        dataset_type: One of :data:`DATASET_TYPES`.
-        packed_sequence: Whether to enable packed sequences.
-        seq_length: Explicit sequence length (None = use model's or default 4096).
-        cli_overrides: Mutable list of Hydra-style CLI overrides. For ``llm-finetune``,
-            ``dataset.dataset_name`` is extracted and consumed here to select the preset.
-
-    Returns:
-        The modified ConfigContainer.
-    """
-    resolved_seq_length = _resolve_seq_length(config, seq_length)
-    if cli_overrides is None:
-        cli_overrides = []
-
-    if dataset_type == "llm-pretrain":
-        config.dataset = GPTDatasetConfig(
-            seq_length=resolved_seq_length,
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            num_dataset_builder_threads=1,
-            blend=None,
-            blend_per_split=None,
-            split="9999,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        )
-
-    elif dataset_type == "llm-pretrain-mock":
-        config.dataset = MockGPTDatasetConfig(
-            seq_length=resolved_seq_length,
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            num_dataset_builder_threads=1,
-            split="9999,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        )
-
-    elif dataset_type == "llm-finetune":
-        preset_name = extract_and_remove_override(cli_overrides, "dataset.dataset_name", default="squad")
-        if preset_name not in LLM_FINETUNE_PRESETS:
-            raise ValueError(
-                f"Unknown finetune dataset preset: '{preset_name}'. "
-                f"Choose from: {', '.join(sorted(LLM_FINETUNE_PRESETS.keys()))}"
-            )
-        factory = LLM_FINETUNE_PRESETS[preset_name]
-        kwargs: dict = {"packed_sequence": packed_sequence, "pad_seq_to_mult": 1}
-        kwargs["seq_length"] = resolved_seq_length
-        config.dataset = factory(**kwargs)
-
-    elif dataset_type == "llm-finetune-preloaded":
-        config.dataset = FinetuningDatasetConfig(
-            seq_length=resolved_seq_length,
-            dataset_root=None,
-            dataloader_type="batch",
-            seed=5678,
-        )
-
-    elif dataset_type == "vlm-energon":
-        if isinstance(config.dataset, EnergonProvider):
-            logger.info("Recipe already provides EnergonProvider; keeping it (preserves task_encoder).")
-        else:
-            logger.warning(
-                "Creating bare EnergonProvider. task_encoder and image_processor are unset; "
-                "use a recipe that provides them or set via code."
-            )
-            config.dataset = EnergonProvider(
-                path="",
-                seq_length=resolved_seq_length,
-                micro_batch_size=config.train.micro_batch_size,
-                global_batch_size=config.train.global_batch_size,
-                num_workers=2,
-            )
-
-    elif dataset_type == "vlm-hf":
-        config.dataset = HFDatasetConversationProvider(
-            seq_length=resolved_seq_length,
-            hf_processor_path=None,
-            maker_name="make_cord_v2_dataset",
-            num_workers=2,
-            dataloader_type="single",
-            data_sharding=True,
-            pin_memory=True,
-            persistent_workers=False,
-            pack_sequences_in_batch=False,
-        )
-
-    elif dataset_type == "vlm-preloaded":
-        config.dataset = PreloadedVLMConversationProvider(
-            seq_length=resolved_seq_length,
-            hf_processor_path=None,
-            train_data_path=None,
-            valid_data_path=None,
-            test_data_path=None,
-            dataloader_type="single",
-            num_workers=2,
-        )
-
-    else:
-        raise ValueError(f"Unknown dataset type: '{dataset_type}'. Choose from: {', '.join(DATASET_TYPES)}")
-
-    if seq_length is not None and hasattr(config, "model") and config.model is not None:
-        config.model.seq_length = seq_length
-
-    return config
-
-
-def infer_mode_from_dataset(dataset_type: str) -> str:
-    """Infer training mode from the dataset type prefix."""
-    if dataset_type.startswith("llm-pretrain"):
-        return "pretrain"
-    return "finetune"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/__init__.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Megatron Bridge Recipe Configurations
-
-This module exposes all recipe configurations from all model families.
-"""
-
-from megatron.bridge.diffusion.recipes.flux.flux import *
-from megatron.bridge.diffusion.recipes.wan.wan import *
-from megatron.bridge.recipes.deepseek import *
-from megatron.bridge.recipes.gemma import *
-from megatron.bridge.recipes.gemma3_vl import *
-from megatron.bridge.recipes.glm import *
-from megatron.bridge.recipes.glm_vl import *
-from megatron.bridge.recipes.gpt import *
-from megatron.bridge.recipes.gpt_oss import *
-from megatron.bridge.recipes.kimi_vl import *
-from megatron.bridge.recipes.llama import *
-from megatron.bridge.recipes.ministral3 import *
-from megatron.bridge.recipes.moonlight import *
-from megatron.bridge.recipes.nemotronh import *
-from megatron.bridge.recipes.olmoe import *
-from megatron.bridge.recipes.qwen import *
-from megatron.bridge.recipes.qwen2_audio import *
-from megatron.bridge.recipes.qwen_vl import *
-
-```
-</file_contents>
-<user_instructions>
-<taskname="Pretraining Tutorial Context"/>
-<task>Write a comprehensive tutorial about pretraining in Megatron-Bridge. Cover the end-to-end workflow (config recipes, dataset wiring, launch methods, core entry points, scaling/perf options, and practical examples), grounded in the selected Megatron-Bridge docs and source files.</task>
-
-<architecture>
-- Documentation layer (`docs/`): conceptual guidance for training configuration, loop controls, optimization, distributed parallelisms, performance tuning, checkpointing, and resiliency.
-- Recipe layer (`src/megatron/bridge/recipes`): reusable `ConfigContainer` defaults (`common.py`) and model-specific pretrain configs (Nemotron 3 Nano/Super).
-- Launch layer (`scripts/training`): generic recipe runner (`run_recipe.py`) plus NeMo-Run/Slurm launch wrappers.
-- Execution layer (`src/megatron/bridge/training`): `pretrain()` orchestration, setup/bootstrap, and GPT forward-step behavior.
-- Example layer (`examples/models/*`): concrete pretraining commands and Slurm job templates for Nemotron 3 and GPT-OSS.
-</architecture>
-
-<selected_context>
-Megatron-Bridge/docs/recipe-usage.md: Central guide for recipe-based pretraining, override patterns (Python/YAML/Hydra-style), and launch methods (`torchrun`, NeMo-Run).
-Megatron-Bridge/docs/training/*.md: Full training docs set covering config container, entry points, training-loop settings, optimizer/scheduler, logging, checkpointing, mixed precision, callbacks, profiling, communication overlap, FSDP, resiliency, and related tuning topics.
-Megatron-Bridge/docs/parallelisms.md: Detailed distributed parallelism reference used by pretraining tutorials (TP/PP/EP/CP/SP and tradeoffs).
-Megatron-Bridge/docs/performance-guide.md and performance-summary.md: Practical performance recommendations and quick reference.
-Megatron-Bridge/docs/models/llm/nemotron3.md: Nemotron 3 Nano pretraining walkthrough and command examples.
-Megatron-Bridge/docs/models/llm/nemotron3-super.md: Nemotron 3 Super pretraining workflow, hardware/parallelism requirements, and command examples.
-Megatron-Bridge/docs/models/llm/nemotronh.md: Nemotron H/Nano v2 model family context and recipe usage patterns relevant to pretraining narrative.
-Megatron-Bridge/scripts/training/run_recipe.py: Generic CLI training entry script; loads recipe, applies dataset override and Hydra-style config overrides, selects step function, dispatches `pretrain`/`finetune`.
-Megatron-Bridge/scripts/training/README.md: User-facing launcher usage and common command patterns for pretraining.
-Megatron-Bridge/scripts/training/launch_with_nemo_run.py and launch_with_sbatch.sh: Multi-node orchestration patterns and practical launch templates.
-Megatron-Bridge/src/megatron/bridge/recipes/common.py: `_pretrain_common()` baseline defaults (train/scheduler/logger/checkpoint/tokenizer/dataset scaffolding).
-Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_nano.py: `nemotron_3_nano_pretrain_config()` model-specific overrides on top of `_pretrain_common`.
-Megatron-Bridge/src/megatron/bridge/recipes/nemotronh/nemotron_3_super.py: `nemotron_3_super_pretrain_config()` model-specific overrides and pretrained bridge references.
-Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py: dataset-type mapping and `apply_dataset_override()` logic used by generic scripts.
-Megatron-Bridge/src/megatron/bridge/training/pretrain.py: top-level pretraining orchestration (`runtime_config_update` → `setup` → `train`/eval/test lifecycle).
-Megatron-Bridge/src/megatron/bridge/training/setup.py: initialization path (dist setup, tokenizer/model/optimizer construction, checkpoint load, data iterator setup).
-Megatron-Bridge/src/megatron/bridge/training/gpt_step.py: canonical GPT forward-step batch handling and loss wiring for tutorial code-path explanation.
-Megatron-Bridge/src/megatron/bridge/training/utils/omegaconf_utils.py: conversion/merge/apply utilities for YAML and CLI overrides used in examples.
-Megatron-Bridge/examples/models/nemotron_3/*/pretrain_*.py: concrete Python pretraining entry scripts with override handling and `pretrain(config, forward_step_func)` invocation.
-Megatron-Bridge/examples/models/nemotron_3/*/slurm_pretrain.sh: production-style Slurm launch templates and common runtime env/override patterns.
-Megatron-Bridge/examples/models/gpt_oss/README.md + slurm_pretrain.sh: additional pretraining recipe/launcher example set for a different model family.
-</selected_context>
-
-<relationships>
-- `scripts/training/run_recipe.py` -> `megatron.bridge.recipes.<recipe_fn>()` -> `ConfigContainer` -> `training.pretrain.pretrain()`.
-- `recipes/common.py::_pretrain_common()` provides shared defaults; model recipe functions (e.g., Nemotron 3 Nano/Super) specialize those defaults.
-- `dataset_utils.apply_dataset_override()` mutates recipe config according to `--dataset` mode before final override processing.
-- `training.pretrain.pretrain()` calls `training.setup.setup()` to build runtime state and then drives training/eval flow.
-- `training.setup.setup()` wires tokenizer + model + optimizer + scheduler + checkpoint + dataloaders; then returns objects used by train loop.
-- Example scripts (`examples/models/.../pretrain_*.py`) are minimal wrappers around recipe creation + OmegaConf/Hydra override parsing + `pretrain()` call.
-- Docs map directly to these layers: recipe usage and training docs explain the same fields/functions exercised by scripts and source.
-</relationships>
-
-<ambiguities>
-- Scope-specific: selection is constrained to `Megatron-Bridge` only.
-- Tutorial target audience depth is not explicitly specified (beginner vs advanced), so structure should be broadly accessible while preserving advanced sections (distributed/perf/resiliency).
-</ambiguities>
-</user_instructions>
diff --git a/skills/nemotron-customize/context/mbridge-sft-full.txt b/skills/nemotron-customize/context/mbridge-sft-full.txt
deleted file mode 100644
index 7560a7b5d..000000000
--- a/skills/nemotron-customize/context/mbridge-sft-full.txt
+++ /dev/null
@@ -1,8632 +0,0 @@
-<file_map>
-/Users/mromeijn/src/Megatron-Bridge
-├── docs
-│   ├── training
-│   │   ├── images
-│   │   ├── README.md *
-│   │   └── packed-sequences.md *
-│   ├── images
-│   ├── modelopt
-│   ├── models
-│   │   ├── llm
-│   │   └── vlm
-│   ├── releases
-│   ├── bridge-guide.md *
-│   ├── parallelisms.md *
-│   └── recipe-usage.md *
-├── scripts
-│   ├── training
-│   │   └── run_recipe.py * +
-│   └── performance
-│       ├── configs
-│       │   ├── deepseek
-│       │   ├── gpt_oss
-│       │   ├── kimi
-│       │   ├── llama
-│       │   ├── nemotronh
-│       │   ├── qwen
-│       │   └── qwen_vl
-│       └── utils
-├── skills
-│   ├── mlm-bridge-training
-│   │   └── SKILL.md *
-│   ├── perf-techniques
-│   │   ├── sequence-packing
-│   │   │   └── SKILL.md *
-│   │   ├── cuda-graphs
-│   │   ├── expert-parallel-overlap
-│   │   ├── hybrid-context-parallel
-│   │   ├── megatron-fsdp
-│   │   ├── moe-comm-overlap
-│   │   ├── packed-sequences-long-context
-│   │   ├── parallelism-strategies
-│   │   └── tp-dp-comm-overlap
-│   ├── adding-model-support
-│   ├── code-style
-│   ├── developer-guide
-│   ├── multi-node-slurm
-│   ├── parity-testing
-│   └── resiliency
-├── src
-│   └── megatron
-│       └── bridge
-│           ├── data
-│           │   ├── builders
-│           │   │   └── finetuning_dataset.py * +
-│           │   ├── datasets
-│           │   │   ├── packed_parquet.py * +
-│           │   │   ├── packed_sequence.py * +
-│           │   │   └── sft.py * +
-│           │   ├── ...
-│           ├── recipes
-│           │   ├── utils
-│           │   │   ├── dataset_utils.py * +
-│           │   │   └── finetune_utils.py * +
-│           │   ├── common.py * +
-│           │   ├── ...
-│           ├── training
-│           │   ├── config.py * +
-│           │   ├── finetune.py * +
-│           │   ├── gpt_step.py * +
-│           │   ├── ...
-│           ├── diffusion
-│           │   └── ...
-│           ├── inference
-│           │   └── ...
-│           ├── models
-│           │   └── ...
-│           ├── peft
-│           └── utils
-├── .github
-│   ├── ISSUE_TEMPLATE
-│   ├── actions
-│   │   └── test-template
-│   └── workflows
-│       └── config
-├── .specstory
-├── 3rdparty
-│   └── Megatron-LM
-│       ├── .github
-│       │   ├── ISSUE_TEMPLATE
-│       │   ├── actions
-│       │   │   └── ...
-│       │   ├── scripts
-│       │   └── workflows
-│       │       └── ...
-│       ├── .gitlab
-│       │   ├── scripts
-│       │   └── stages
-│       ├── docker
-│       │   ├── common
-│       │   └── patches
-│       ├── docs
-│       │   ├── advanced
-│       │   ├── api-guide
-│       │   │   └── ...
-│       │   ├── developer
-│       │   ├── discussions
-│       │   │   └── ...
-│       │   ├── get-started
-│       │   ├── images
-│       │   │   └── ...
-│       │   ├── models
-│       │   └── user-guide
-│       │       └── ...
-│       ├── examples
-│       │   ├── academic_paper_scripts
-│       │   │   └── ...
-│       │   ├── bert
-│       │   ├── export
-│       │   │   └── ...
-│       │   ├── gpt3
-│       │   ├── inference
-│       │   │   └── ...
-│       │   ├── llama
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   │   └── ...
-│       │   ├── mixtral
-│       │   ├── multimodal
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   │   └── ...
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── t5
-│       ├── images
-│       ├── megatron
-│       │   ├── core
-│       │   │   └── ...
-│       │   ├── inference
-│       │   ├── legacy
-│       │   │   └── ...
-│       │   ├── post_training
-│       │   ├── rl
-│       │   │   └── ...
-│       │   └── training
-│       │       └── ...
-│       ├── scripts
-│       ├── tasks
-│       ├── tests
-│       │   ├── functional_tests
-│       │   │   └── ...
-│       │   ├── test_utils
-│       │   │   └── ...
-│       │   └── unit_tests
-│       │       └── ...
-│       └── tools
-│           ├── bert_embedding
-│           └── checkpoint
-├── docker
-│   ├── common
-│   └── patches
-├── examples
-│   ├── conversion
-│   │   ├── adapter
-│   │   └── compare_hf_and_megatron
-│   ├── decentralized_pg
-│   ├── diffusion
-│   │   └── recipes
-│   │       ├── flux
-│   │       │   └── ...
-│   │       └── wan
-│   │           └── ...
-│   ├── distillation
-│   │   └── llama
-│   │       └── conf
-│   ├── evaluation
-│   │   └── utils
-│   ├── inference
-│   │   └── vlm
-│   ├── long_context
-│   ├── models
-│   │   ├── audio_lm
-│   │   │   ├── qwen2_audio
-│   │   │   └── qwen3_asr
-│   │   ├── bailing
-│   │   ├── gpt_oss
-│   │   ├── minimax_m2
-│   │   ├── nemotron_3
-│   │   │   ├── nano
-│   │   │   └── super
-│   │   ├── qwen3_next
-│   │   │   └── conf
-│   │   ├── sarvam
-│   │   └── vlm
-│   │       ├── gemma3_vl
-│   │       ├── glm_45v
-│   │       ├── kimi_k25_vl
-│   │       ├── ministral3
-│   │       ├── nemotron_vl
-│   │       │   └── ...
-│   │       ├── qwen25_omni
-│   │       ├── qwen35_vl
-│   │       ├── qwen3_vl
-│   │       └── qwen_vl
-│   │           └── ...
-│   ├── peft
-│   ├── quantization
-│   │   └── conf
-│   ├── resiliency
-│   │   ├── fault_tolerance
-│   │   └── straggler_detection
-│   └── rl
-├── tests
-│   ├── functional_tests
-│   │   ├── data
-│   │   │   ├── energon
-│   │   │   └── hf_processors
-│   │   ├── diffusion
-│   │   │   ├── flux
-│   │   │   └── wan
-│   │   ├── inference
-│   │   ├── launch_scripts
-│   │   │   ├── active
-│   │   │   └── flaky
-│   │   ├── models
-│   │   │   ├── qwen3_asr
-│   │   │   └── qwen_audio
-│   │   └── test_groups
-│   │       ├── ckpts
-│   │       │   └── ...
-│   │       ├── converter
-│   │       ├── data
-│   │       │   └── ...
-│   │       ├── diffusion
-│   │       │   └── ...
-│   │       ├── models
-│   │       │   └── ...
-│   │       ├── quantization
-│   │       │   └── ...
-│   │       ├── recipes
-│   │       ├── training
-│   │       └── utils
-│   └── unit_tests
-│       ├── data
-│       │   ├── builders
-│       │   ├── datasets
-│       │   ├── energon
-│       │   ├── mimo
-│       │   └── vlm_datasets
-│       ├── diffusion
-│       │   ├── data
-│       │   │   └── ...
-│       │   ├── model
-│       │   │   └── ...
-│       │   └── recipes
-│       │       └── ...
-│       ├── inference
-│       │   └── vlm
-│       ├── models
-│       │   ├── common
-│       │   ├── decorators
-│       │   ├── deepseek
-│       │   ├── gemma
-│       │   ├── gemma_vl
-│       │   ├── glm
-│       │   ├── glm_vl
-│       │   ├── gpt
-│       │   ├── gpt_oss
-│       │   ├── hf_pretrained
-│       │   ├── kimi
-│       │   ├── kimi_vl
-│       │   ├── llama
-│       │   ├── llama_nemotron
-│       │   ├── mamba
-│       │   ├── mimo
-│       │   ├── minimax_m2
-│       │   ├── ministral3
-│       │   ├── mistral
-│       │   ├── nemotron
-│       │   ├── nemotron_vl
-│       │   ├── nemotronh
-│       │   ├── olmoe
-│       │   ├── qwen
-│       │   ├── qwen3_asr
-│       │   │   └── ...
-│       │   ├── qwen_audio
-│       │   ├── qwen_omni
-│       │   │   └── ...
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── sarvam
-│       ├── peft
-│       ├── recipes
-│       │   ├── gemma
-│       │   ├── gpt
-│       │   ├── kimi
-│       │   ├── nemotronh
-│       │   ├── qwen
-│       │   ├── qwen_vl
-│       │   │   └── ...
-│       │   └── utils
-│       ├── scripts
-│       │   └── performance
-│       ├── training
-│       │   ├── mimo
-│       │   ├── mlm_compat
-│       │   ├── post_training
-│       │   └── utils
-│       └── utils
-└── tutorials
-    ├── data
-    │   └── dclm
-    ├── recipes
-    │   └── llama
-    │       └── conf
-    └── training
-
-
-(* denotes selected files)
-(+ denotes code-map available)
-Config: directory-only view; depth cap 3; selected files shown.
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/tokenizers/megatron_tokenizer.py
-Imports:
-  - import importlib
-  - import json
-  - import logging
-  - import os
-  - from collections import OrderedDict
-  - from typing import Optional, Union
-  - from megatron.core.tokenizers.base_tokenizer import MegatronTokenizerBase
----
-Classes:
-  - MegatronTokenizer
-    Methods:
-      - L40: def __init__(self) -> None:
-      - L46: def from_pretrained(
-        tokenizer_path: str = None, metadata_path: Optional[Union[str, dict]] = None, **kwargs
-    ) -> MegatronTokenizerBase:
-      - L104: def write_metadata(
-        tokenizer_path: str,
-        tokenizer_library: str,
-        model_type: Optional[str] = None,
-        tokenizer_class: Optional[MegatronTokenizerBase] = None,
-        chat_template: Optional[str] = None,
-        overwrite: Optional[bool] = False,
-        metadata_path: Optional[str] = None,
-    ) -> None:
-
-Functions:
-  - L170: def _get_metadata_path(tokenizer_path: str) -> str:
-  - L188: def _get_tokenizer_model_class(library: str, metadata: dict) -> MegatronTokenizerBase:
-
-Global vars:
-  - TOKENIZER_MAPPING_NAMES
-  - TEXT_LIBRARIES
-  - VISION_LIBRARIES
-  - logger
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/callbacks.py
-Imports:
-  - import logging
-  - from collections.abc import Callable
-  - from dataclasses import dataclass, field
-  - from typing import TYPE_CHECKING
-  - import torch
-  - from megatron.core.optimizer import MegatronOptimizer
-  - from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
-  - from megatron.core.transformer import MegatronModule
-  - from megatron.bridge.training.state import GlobalState
----
-Classes:
-  - CallbackContext
-    Properties:
-      - state
-      - model
-      - user_state
-      - optimizer
-      - scheduler
-      - loss_dict
-      - grad_norm
-      - skipped_iter
-      - total_loss_dict
-  - Callback
-    Methods:
-      - L145: def on_data_init_start(self, context: CallbackContext) -> None:
-      - L154: def on_train_start(self, context: CallbackContext) -> None:
-      - L158: def on_train_step_start(self, context: CallbackContext) -> None:
-      - L162: def on_train_step_end(self, context: CallbackContext) -> None:
-      - L166: def on_train_end(self, context: CallbackContext) -> None:
-      - L170: def on_eval_start(self, context: CallbackContext) -> None:
-      - L174: def on_eval_step_start(self, context: CallbackContext) -> None:
-      - L178: def on_eval_step_end(self, context: CallbackContext) -> None:
-      - L182: def on_eval_end(self, context: CallbackContext) -> None:
-      - L186: def on_test_start(self, context: CallbackContext) -> None:
-      - L190: def on_test_step_start(self, context: CallbackContext) -> None:
-      - L194: def on_test_step_end(self, context: CallbackContext) -> None:
-      - L198: def on_test_end(self, context: CallbackContext) -> None:
-      - L202: def on_checkpoint_save(self, context: CallbackContext) -> None:
-  - CallbackManager
-    Methods:
-      - L237: def __init__(self) -> None:
-      - L244: def user_state(self) -> dict:
-      - L248: def add(self, callback: Callback | list[Callback]) -> None:
-      - L273: def register(self, event_name: str, fn: Callable[[CallbackContext], None]) -> None:
-      - L308: def events(self) -> frozenset[str]:
-      - L312: def list_callbacks(self, event_name: str) -> list[Callable[[CallbackContext], None]]:
-      - L328: def has_callbacks(self, event_name: str) -> bool:
-      - L339: def fire(self, event_name: str, context: CallbackContext) -> None:
-
-Functions:
-  - L352: def normalize_callbacks(
-    callbacks: list[Callback] | CallbackManager | None,
-) -> CallbackManager | None:
-  - L376: def should_fire(callback_manager: CallbackManager | None, event_name: str) -> bool:
-
-Global vars:
-  - logger
-  - VALID_EVENTS
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/state.py
-Imports:
-  - import json
-  - import os
-  - import time
-  - import types
-  - from dataclasses import dataclass
-  - from typing import Any, Optional
-  - import torch
-  - from megatron.core.energy_monitor import EnergyMonitor
-  - from megatron.core.timers import Timers
-  - from megatron.core.utils import StragglerDetector
-  - from torch.distributed.checkpoint.stateful import Stateful
-  - from torch.utils.tensorboard.writer import SummaryWriter
-  - from megatron.core.dist_checkpointing.strategies.torch import get_async_strategy
-  - from megatron.bridge.training.config import ConfigContainer
-  - from megatron.bridge.training.nvrx_straggler import NVRxStragglerDetectionManager
-  - from megatron.bridge.training.tokenizers.tokenizer import build_tokenizer
-  - from megatron.bridge.training.utils.log_utils import safe_serialize
-  - from megatron.bridge.training.utils.sig_utils import DistributedSignalHandler
-  - from megatron.bridge.utils.common_utils import get_rank_safe, get_world_size_safe
-  - import wandb
-  - import mlflow
-  - import comet_ml
-  - import warnings
----
-Classes:
-  - TrainState
-    Methods:
-      - L63: def state_dict(self) -> dict[str, torch.Tensor]:
-      - L85: def load_state_dict(self, state_dict: dict[str, torch.Tensor]) -> None:
-    Properties:
-      - step
-      - consumed_train_samples
-      - skipped_train_samples
-      - consumed_valid_samples
-      - floating_point_operations_so_far
-      - do_train
-      - do_valid
-      - do_test
-  - FaultToleranceState
-    Properties:
-      - ft_state_path
-      - is_persistent_chkpt_loaded
-      - is_async_chkpt_enabled
-      - is_calculating_timeouts
-      - is_setup_section_open
-      - seen_checkpoints_cnt
-      - seen_tr_iters_cnt
-      - curr_eval_iter_idx
-  - GlobalState
-    Methods:
-      - L124: def __init__(self) -> None:
-      - L151: def cfg(self) -> Optional[ConfigContainer]:
-      - L156: def cfg(self, value: Optional[ConfigContainer]) -> None:
-      - L172: def tokenizer(self) -> Any:
-      - L179: def tensorboard_logger(self) -> Optional[SummaryWriter]:
-      - L195: def wandb_logger(self) -> Optional[Any]:
-      - L228: def mlflow_logger(self) -> Optional[Any]:
-      - L255: def _flatten_dict(d: dict[str, Any], parent_key: str = "", sep: str = ".") -> dict[str, Any]:
-      - L294: def comet_logger(self) -> Optional[Any]:
-      - L342: def timers(self) -> Timers:
-      - L352: def train_state(self) -> TrainState:
-      - L359: def train_state(self, value: TrainState) -> None:
-      - L368: def fault_tolerance_state(self) -> FaultToleranceState:
-      - L375: def fault_tolerance_state(self, value: FaultToleranceState) -> None:
-      - L384: def signal_handler(self) -> DistributedSignalHandler:
-      - L391: def straggler_timer(self) -> StragglerDetector:
-      - L397: def initialize_async_checkpoint_worker(self) -> None:
-      - L425: def async_calls_queue(self) -> Optional[Any]:
-      - L430: def nvrx_straggler_manager(self) -> Optional[NVRxStragglerDetectionManager]:
-      - L443: def energy_monitor(self) -> Optional[EnergyMonitor]:
-      - L455: def _set_signal_handler(self) -> None:
-      - L460: def reset_for_restart(self) -> None:
-
-Functions:
-  - L480: def _timers_write_to_wandb(
-    self: Timers,
-    names: list[str],
-    writer: Any,
-    iteration: int,
-    normalizer: float = 1.0,
-    reset: bool = True,
-    barrier: bool = False,
-) -> None:
-  - L501: def _timers_write_to_mlflow(
-    self: Timers,
-    names: list[str],
-    logger: Any,
-    iteration: int,
-    normalizer: float = 1.0,
-    reset: bool = True,
-    barrier: bool = False,
-) -> None:
-  - L527: def _timers_write_to_comet(
-    self: Timers,
-    names: list[str],
-    logger: Any,
-    iteration: int,
-    normalizer: float = 1.0,
-    reset: bool = True,
-    barrier: bool = False,
-) -> None:
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/builders/hf_dataset.py
-Imports:
-  - import glob
-  - import json
-  - import logging
-  - import os
-  - import shutil
-  - from dataclasses import dataclass
-  - from pathlib import Path
-  - from typing import Any, Callable, Optional, Protocol, TypedDict, Union, cast
-  - from datasets import Dataset, DatasetDict, load_dataset
-  - from tqdm import tqdm
-  - from megatron.bridge.data.builders.finetuning_dataset import FinetuningDatasetBuilder
-  - from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-  - from megatron.bridge.data.datasets.sft import get_dataset_root
-  - from megatron.bridge.training.config import FinetuningDatasetConfig
-  - from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-  - from megatron.bridge.utils.common_utils import print_rank_0
----
-Classes:
-  - ProcessExampleOutput
-    Properties:
-      - input
-      - output
-      - original_answers
-  - ProcessExampleFn
-    Methods:
-      - L58: def __call__(self, example: dict[str, Any], tokenizer: MegatronTokenizer | None = None) -> dict[str, Any]:
-  - HFDatasetConfig
-    Properties:
-      - dataset_name
-      - process_example_fn
-      - dataset_subset
-      - dataset_dict
-      - split
-      - download_mode
-      - val_proportion
-      - split_val_from_train
-      - delete_raw
-      - rewrite
-      - hf_kwargs
-      - hf_filter_lambda
-      - hf_filter_lambda_kwargs
-  - HFDatasetBuilder
-    Methods:
-      - L227: def __init__(
-        self,
-        dataset_name: str,
-        tokenizer,
-        process_example_fn: ProcessExampleFn,
-        dataset_dict: Optional[DatasetDict] = None,
-        dataset_subset: Optional[str] = None,
-        dataset_root: Optional[Union[str, Path]] = None,
-        split=None,
-        seq_length=1024,
-        seed: int = 1234,
-        memmap_workers: int = 1,
-        max_train_samples: Optional[int] = None,
-        packed_sequence_specs: Optional[PackedSequenceSpecs] = None,
-        download_mode: Optional[str] = None,
-        val_proportion: Optional[float] = 0.05,
-        split_val_from_train: bool = True,
-        rewrite: bool = True,
-        delete_raw: bool = False,
-        hf_kwargs: Optional[dict[str, Any]] = None,
-        dataset_kwargs: Optional[dict[str, Any]] = None,
-        hf_filter_lambda: Optional[Callable] = None,
-        hf_filter_lambda_kwargs: Optional[dict[str, Any]] = None,
-        do_validation: bool = True,
-        do_test: bool = True,
-    ) -> None:
-      - L318: def prepare_data(self) -> None:
-      - L347: def _load_dataset(self) -> DatasetDict:
-
-Functions:
-  - L101: def preprocess_and_split_data(
-    dset: DatasetDict,
-    dataset_name: str,
-    dataset_root: Path,
-    tokenizer: MegatronTokenizer,
-    process_example_fn: ProcessExampleFn,
-    split_val_from_train: bool = True,
-    val_proportion: Optional[float] = None,
-    train_aliases: tuple[str] = ("train", "training"),
-    test_aliases: tuple[str] = ("test", "testing"),
-    val_aliases: tuple[str] = ("val", "validation", "valid", "eval"),
-    delete_raw: bool = False,
-    seed: int = 1234,
-    rewrite: bool = False,
-    do_test: bool = True,
-    do_validation: bool = True,
-):
-
-Global vars:
-  - logger
----
-
-
-File: /Users/mromeijn/src/Nemotron/src/nemotron/kit/megatron_stub.py
-Imports:
-  - from dataclasses import dataclass, field
-  - from pathlib import Path
----
-Classes:
-  - DataConfig
-    Properties:
-      - data_path
-      - mock
-      - seq_length
-      - micro_batch_size
-      - global_batch_size
-  - ModelConfig
-    Properties:
-      - name
-      - num_layers
-      - hidden_size
-      - num_attention_heads
-      - ffn_hidden_size
-      - vocab_size
-  - OptimizerConfig
-    Properties:
-      - lr
-      - min_lr
-      - weight_decay
-      - adam_beta1
-      - adam_beta2
-  - TrainingConfig
-    Properties:
-      - max_steps
-      - log_interval
-      - eval_interval
-      - save_interval
-      - fp16
-      - bf16
-  - CheckpointConfig
-    Properties:
-      - dir
-      - save_on_train_end
-      - resume_from
-  - ConfigContainer
-    Properties:
-      - data
-      - model
-      - optimizer
-      - training
-      - checkpoint
----
-
-
-File: /Users/mromeijn/src/Megatron-Bridge/3rdparty/Megatron-LM/megatron/legacy/model/gpt_model.py
-Imports:
-  - import torch
-  - from typing import Optional
-  - from megatron.training import get_args
-  - from megatron.core import tensor_parallel
-  - from megatron.core.utils import deprecate_inference_params
-  - from .enums import AttnMaskType
-  - from .language_model import parallel_lm_logits
-  - from .language_model import get_language_model
-  - from .module import MegatronModule
----
-Classes:
-  - GPTModel
-    Methods:
-      - L48: def __init__(self,
-                 config,
-                 num_tokentypes=0,
-                 parallel_output=True,
-                 pre_process=True,
-                 post_process=True):
-      - L74: def set_input_tensor(self, input_tensor):
-      - L78: def forward(self, input_ids, position_ids, attention_mask,
-                labels=None, tokentype_ids=None, inference_context=None, *, inference_params=None):
-      - L98: def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False):
-      - L111: def load_state_dict(self, state_dict, strict=True):
-
-Functions:
-  - L18: def post_language_model_processing(lm_output, labels, logit_weights,
-                                   parallel_output,
-                                   fp16_lm_cross_entropy):
----
-
-</file_map>
-<file_contents>
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/README.md
-```md
-# Training and Customization
-
-This directory contains comprehensive documentation for training and customizing models with Megatron Bridge. Learn how to configure training, optimize performance, and customize training workflows.
-
-## Quick Navigation
-
-### I want to
-
-**🚀 Get started with training**
-→ Start with [Configuration Container Overview](config-container-overview.md) to understand the training setup
-
-**⚙️ Configure training parameters**
-→ See [Training Loop Settings](training-loop-settings.md) and [Optimizer & Scheduler](optimizer-scheduler.md)
-
-**📊 Monitor and profile training**
-→ Check [Logging](logging.md) and [Profiling](profiling.md) guides
-
-**💾 Manage checkpoints**
-→ Read [Checkpointing](checkpointing.md) for saving and resuming training
-
-**⚡ Optimize performance**
-→ Explore [Performance Guide](../performance-guide.md) and [Performance Summary](../performance-summary.md)
-
-**🔧 Customize training**
-→ See [PEFT](peft.md), [Distillation](distillation.md), [Entry Points](entry-points.md), and [Callbacks](callbacks.md)
-
-## Core Training Documentation
-
-### Configuration and Setup
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Configuration Container Overview](config-container-overview.md)** | Central configuration object for all training settings | First time setting up training |
-| **[Entry Points](entry-points.md)** | Training entry points and execution flow | Understanding how training starts |
-| **[Training Loop Settings](training-loop-settings.md)** | Training loop parameters and configuration | Configuring batch sizes, iterations, validation |
-
-### Optimization and Performance
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Optimizer & Scheduler](optimizer-scheduler.md)** | Optimizer and learning rate scheduler configuration | Setting up optimization |
-| **[Mixed Precision](mixed-precision.md)** | Mixed precision training for memory efficiency | Reducing memory usage |
-| **[Communication Overlap](communication-overlap.md)** | Overlapping communication with computation | Optimizing distributed training |
-| **[Hybrid Context Parallel](hybrid-context-parallel.md)** | Hierarchical `a2a+p2p` context parallel guidance | Advanced long-sequence scaling |
-| **[Attention Optimizations](attention-optimizations.md)** | Optimizing attention mechanisms | Improving training speed |
-| **[Activation Recomputation](activation-recomputation.md)** | Gradient checkpointing strategies | Reducing memory footprint |
-| **[CPU Offloading](cpu-offloading.md)** | Offloading to CPU for memory management | Working with limited GPU memory |
-
-### Monitoring and Debugging
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[Logging](logging.md)** | Logging configuration and TensorBoard/WandB integration | Monitoring training progress |
-| **[Profiling](profiling.md)** | Performance profiling and analysis | Identifying bottlenecks |
-| **[Resiliency](resiliency.md)** | Handling failures and recovery | Building robust training pipelines |
-
-### Advanced Features
-
-| Document | Purpose | When to Read |
-|----------|---------|--------------|
-| **[PEFT](peft.md)** | Parameter-Efficient Fine-Tuning (LoRA, etc.) | Fine-tuning with limited resources |
-| **[Packed Sequences](packed-sequences.md)** | Sequence packing for efficiency | Optimizing data loading |
-| **[Megatron FSDP](megatron-fsdp.md)** | Stable overview of Megatron FSDP | Choosing an FSDP path |
-| **[Distillation](distillation.md)** | Knowledge distillation techniques | Transferring knowledge between models |
-| **[Checkpointing](checkpointing.md)** | Checkpoint saving, loading, and resuming | Managing training state |
-| **[Callbacks](callbacks.md)** | Inject custom logic into training loop | Custom logging, metrics, third-party integrations |
-
-## Training Workflow
-
-A typical training workflow involves:
-
-1. **Configure Training** - Set up `ConfigContainer` with model, data, and training parameters
-2. **Prepare Data** - Configure dataset loading and preprocessing
-3. **Set Optimization** - Configure optimizer, scheduler, and mixed precision
-4. **Enable Monitoring** - Set up logging and profiling
-5. **Configure Checkpointing** - Set up checkpoint saving and resuming
-6. **Launch Training** - Start training with configured entry points
-7. **Monitor Progress** - Track metrics via logging and profiling
-8. **Resume if Needed** - Use checkpointing to resume from saved state
-
-## Related Documentation
-
-- **[Main Documentation Index](../index.md)** - Return to main documentation
-- **[Performance Guide](../performance-guide.md)** - Comprehensive performance optimization guide
-- **[Performance Summary](../performance-summary.md)** - Quick performance reference
-- **[Recipe Usage](../recipe-usage.md)** - Using training recipes
-- **[Parallelisms](../parallelisms.md)** - Understanding distributed training strategies
-- **[Bridge Guide](../bridge-guide.md)** - Working with Hugging Face models
-
-## Common Training Scenarios
-
-### 🆕 First-Time Training Setup
-
-1. [Configuration Container Overview](config-container-overview.md) - Understand the configuration system
-2. [Entry Points](entry-points.md) - Learn how to start training
-3. [Training Loop Settings](training-loop-settings.md) - Configure basic training parameters
-4. [Logging](logging.md) - Set up monitoring
-
-### ⚡ Performance Optimization
-
-1. [Performance Guide](../performance-guide.md) - Comprehensive optimization strategies
-2. [Mixed Precision](mixed-precision.md) - Enable mixed precision training
-3. [Communication Overlap](communication-overlap.md) - Optimize distributed training
-4. [Activation Recomputation](activation-recomputation.md) - Reduce memory usage
-5. [Profiling](profiling.md) - Identify bottlenecks
-
-### 💾 Production Training
-
-1. [Checkpointing](checkpointing.md) - Reliable checkpoint management
-2. [Resiliency](resiliency.md) - Handle failures gracefully
-3. [Logging](logging.md) - Comprehensive monitoring
-4. [Profiling](profiling.md) - Performance analysis
-
-### 🔧 Customization
-
-1. [PEFT](peft.md) - Parameter-efficient fine-tuning
-2. [Distillation](distillation.md) - Knowledge distillation
-3. [Entry Points](entry-points.md) - Custom training workflows
-4. [Callbacks](callbacks.md) - Inject custom logic (third-party integrations)
-
----
-
-**Ready to start training?** Begin with [Configuration Container Overview](config-container-overview.md) or return to the [main documentation](../README.md).
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/training/packed-sequences.md
-```md
-# Packed Sequences
-
-Packed sequences are a fine-tuning technique that reduces padding waste by
-concatenating multiple examples into one pack while preserving sequence
-boundaries for attention. In Megatron Bridge, this is primarily a supervised
-fine-tuning and PEFT optimization rather than a general pretraining feature.
-
-This page is the stable overview for what packed sequences are, when to use
-them, and which constraints are durable. For operational setup, code anchors,
-and verification commands, see [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md).
-
-## What It Is
-
-Fine-tuning datasets often contain examples with highly variable lengths. When
-those examples are batched conventionally, many tokens in each batch are just
-padding. Packed sequences reduce that waste by building longer packs from
-multiple examples and carrying boundary metadata into the attention path.
-
-In Bridge today, there are two distinct packing paths plus long-context
-enablement through context parallelism:
-
-| Path | Use case | Key config |
-|---|---|---|
-| Offline packed SFT | Text-only finetuning | `packed_sequence_specs` |
-| VLM in-batch packing | VLM finetuning | `pack_sequences_in_batch=True` |
-| Long-context (CP) | Pretrain / finetune at 16K-128K+ | `context_parallel_size > 1` |
-
-These are related but they are not the same knob. Offline packed SFT and VLM
-in-batch packing solve padding waste; long-context training primarily addresses
-activation memory and communication tradeoffs at larger sequence lengths.
-
-## When to Use It
-
-Packed sequences are a good fit when all of the following are true:
-
-- you are doing SFT, PEFT, or VLM finetuning (see the path table above for the
-  packing path that matches your workload)
-- your examples have variable lengths and padding waste is significant
-- you can tolerate the micro-batch constraints of packed training
-
-Packed sequences are usually not the right answer when:
-
-- you are doing standard Megatron-style pretraining, which already concatenates
-  documents during sampling
-- you want long-context training in general, where context parallelism is often
-  the main technique
-- your model family or recipe explicitly opts out of packed-sequence support
-
-## Stable Constraints
-
-The durable constraints for packed sequences in Bridge are:
-
-- packed SFT requires `micro_batch_size == 1`
-- when context parallelism is used, sequence length must satisfy the standard
-  CP divisibility constraints
-- for fine-tuning with CP enabled, per-token loss behavior and reduction
-  settings matter
-- CUDA-graph-friendly packed metadata requires additional padding constraints
-
-Model-family support is not universal. Some families and recipe paths explicitly
-opt out of packed sequences or related packing modes.
-
-## Relationship to Long-Sequence Training
-
-Packed sequences and long-sequence training are often mentioned together because
-both affect sequence layout and memory behavior, but they solve different
-problems:
-
-- packed sequences mainly reduce padding waste in fine-tuning datasets
-- long-sequence training mainly addresses activation memory and communication
-  tradeoffs at larger sequence lengths
-
-For long-sequence training guidance, see:
-
-- `docs/performance-guide.md`
-- `docs/training/hybrid-context-parallel.md`
-
-## Practical Caveats
-
-The most stable caveats to remember are:
-
-1. Packed-sequence support is recipe- and model-family-specific.
-2. Fine-tuning sequence packing should not be assumed to work with every other
-   training feature.
-3. Packed sequences improve efficiency primarily by reducing padding waste, not
-   by replacing long-context parallelism or memory-planning techniques.
-
-## Related Docs
-
-- [docs/training/multi-token-prediction.md](multi-token-prediction.md)
-- [docs/performance-guide.md](../performance-guide.md)
-- [docs/training/hybrid-context-parallel.md](hybrid-context-parallel.md)
-- [skills/perf-techniques/sequence-packing/SKILL.md](../skills/perf-techniques/sequence-packing/SKILL.md)
-- [skills/perf-techniques/sequence-packing/card.yaml](../skills/perf-techniques/sequence-packing/card.yaml)
-- [skills/perf-techniques/packed-sequences-long-context/SKILL.md](../skills/perf-techniques/packed-sequences-long-context/SKILL.md)
-- [skills/perf-techniques/packed-sequences-long-context/card.yaml](../skills/perf-techniques/packed-sequences-long-context/card.yaml)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/parallelisms.md
-```md
-# Parallelisms Guide
-
-Megatron Bridge supports various data-parallel and model-parallel deep learning workload deployment methods, which can be mixed together arbitrarily. These parallelism strategies are configured through model provider classes and leverage Megatron Core's implementation for performance and memory efficiency.
-
-## Data Parallelism
-
-Data Parallelism (DP) replicates the model across multiple GPUs. Data batches are evenly distributed between GPUs and the data-parallel GPUs process them independently. While the computation workload is efficiently distributed across GPUs, inter-GPU communication is required to keep the model replicas consistent between training steps.
-
-### Distributed Data Parallelism
-
-Distributed Data Parallelism (DDP) keeps the model copies consistent by synchronizing parameter gradients across data-parallel GPUs before each parameter update. More specifically, it sums the gradients of all model copies using all-reduce communication collectives.
-
-![Distributed Data Parallelism](images/ddp.gif)
-*Figure: Distributed Data Parallelism synchronizes gradients across multiple GPUs using all-reduce operations.*
-
-### Distributed Optimizer
-
-[Distributed optimizer](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/dist_optimizer.html) is a memory-optimized data-parallel deployment method. It shards the optimizer states and the high-precision master parameters across data-parallel GPUs instead of replicating them. At the parameter optimizer step, each data-parallel GPU updates its shard of parameters. Since each GPU needs only its own gradient shard, the distributed optimizer performs a reduce-scatter of the parameter gradients instead of an all-reduce. Then, the updated parameter shards are all-gathered across data-parallel GPUs. This approach significantly reduces the memory requirements of large-scale LLM training.
-
-### Enable Data Parallelism
-
-In Megatron Bridge, DDP is the default parallel deployment method. The total number of GPUs corresponds to the size of the DP group, and training an LLM with model parallelism decreases the size of the DP group.
-
-To enable the distributed optimizer, configure the {py:class}`bridge.training.config.OptimizerConfig` and {py:class}`bridge.training.config.DistributedDataParallelConfig`:
-
-```python
-from megatron.bridge.training.config import ConfigContainer, DistributedDataParallelConfig, OptimizerConfig
-
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    lr=3e-4,
-    weight_decay=0.1,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    use_distributed_optimizer=True,
-    clip_grad=1.0,
-)
-ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True)
-
-config = ConfigContainer(
-    ddp=ddp_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-For more optimizer options, refer to the {py:class}`bridge.training.config.OptimizerConfig` API documentation.
-
-## Model Parallelism
-
-Model Parallelism (MP) is a distributed model deployment method that partitions the model parameters across GPUs to reduce per-GPU memory requirements. Megatron Bridge supports various model-parallel methods through Megatron Core, which can be mixed to maximize LLM training performance.
-
-### Tensor Parallelism
-
-Tensor Parallelism (TP) is a model-parallel partitioning method that distributes the parameter tensor of an individual layer across GPUs. In addition to reducing model state memory usage, it also saves activation memory as the per-GPU tensor sizes shrink. However, the reduced per-GPU tensor size increases CPU overhead due to smaller per-GPU kernel workloads.
-
-![Tensor Parallelism Overview](images/tp1.png)
-*Figure 1: Tensor Parallelism distributes individual layer parameters across multiple GPUs.*
-
-![Tensor Parallelism Implementation](images/tp2.png)
-*Figure 2: Detailed view of how tensor parallelism splits weight matrices and synchronizes computations.*
-
-#### Enable Tensor Parallelism
-
-To enable TP in Megatron Bridge, configure the `tensor_model_parallel_size` parameter in your model provider. This parameter determines the number of GPUs among which the model's tensors are partitioned.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with tensor parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Enable TP across 2 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Implement Tensor Parallelism
-
-Megatron Bridge integrates TP through the implementation from Megatron Core. For detailed API usage and additional configurations, consult the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.tensor_parallel.html).
-
-### Pipeline Parallelism
-
-Pipeline Parallelism (PP) is a technique that assigns consecutive layers or segments of a neural network to different GPUs. This division allows each GPU to process different stages of the network sequentially.
-
-![Pipeline Parallelism](images/pp.gif)
-*Figure: Pipeline Parallelism distributes consecutive layers across multiple GPUs, processing batches in a pipeline fashion.*
-
-#### Enable Pipeline Parallelism
-
-To utilize Pipeline Parallelism in Megatron Bridge, set the `pipeline_model_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs among which the model's layers are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-
-# Configure model with pipeline parallelism
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,  # Distribute layers across 4 GPUs
-    # ... other model parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Interleaved Pipeline Parallel Schedule
-
-To minimize the pipeline bubble, the computation on each GPU can be divided into multiple subsets of layers (referred to as model chunks), rather than a single contiguous block. Enable this by setting `virtual_pipeline_model_parallel_size`:
-
-```python
-model_config = GPTModelProvider(
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=2,  # 2 model chunks per pipeline stage
-    # ... other model parameters
-)
-```
-
-For more insights into this approach, see the detailed blog: [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/#pipeline_parallelism).
-
-#### Implement Pipeline Parallelism
-
-The Megatron Bridge implementation of PP leverages functionalities from Megatron Core. For more detailed API usage and configurations related to PP, visit the [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/apidocs/core/core.pipeline_parallel.html).
-
-### Expert Parallelism and Mixture of Experts (MoE)
-
-Expert Parallelism (EP) is a type of model parallelism that distributes experts of a Mixture of Experts (MoE) model across GPUs. Unlike other model-parallel techniques, EP is applied to only the expert layers and does not impact the parallel mapping of the rest of the layers.
-
-MoE is a machine learning technique where multiple specialized models (experts, usually multi-layer perceptrons) are combined to solve a complex task. Each expert focuses on a specific subtask or domain, while a gating network dynamically activates the most appropriate expert based on the current input.
-
-![Expert Parallelism](images/ep.png)
-*Figure: Expert Parallelism distributes MoE experts across multiple GPUs while keeping other layers replicated.*
-
-#### Basic MoE Configuration
-
-To enable MoE in Megatron Bridge, configure the basic MoE parameters in your model provider:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure basic MoE model
-model_config = GPTModelProvider(
-    num_moe_experts=8,           # Number of experts in the MoE module
-    moe_router_topk=2,           # Number of experts activated per token
-    moe_ffn_hidden_size=8192,    # Hidden size for expert FFN layers
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Parallelism
-
-To enable EP, set `expert_model_parallel_size` in your model configuration. For example, if the model has eight experts (`num_moe_experts=8`), then setting `expert_model_parallel_size=4` results in each GPU processing two experts. The number of experts should be divisible by the expert parallel size.
-
-```python
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,  # Distribute 8 experts across 4 GPUs (2 experts per GPU)
-    # ... other model parameters
-)
-```
-
-#### Enable Expert Tensor Parallelism
-
-To enable Expert Tensor Parallelism (ETP), set `expert_tensor_parallel_size` in your model configuration:
-
-```python
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    expert_tensor_parallel_size=2,  # Apply tensor parallelism within each expert
-    # ... other model parameters
-)
-```
-
-#### Advanced MoE Features
-
-Megatron Bridge provides several advanced optimizations for MoE models to improve performance on modern GPU architectures.
-
-##### DeepEP and HybridEP Optimizations
-
-DeepEP and HybridEP are high-performance MoE token dispatchers that improve throughput and efficiency on specific GPU architectures:
-
-- **DeepEP**: Optimized for Ampere, Hopper, B200, and B300 GPUs
-- **HybridEP**: Optimized for GB200, GB300 with NVL72, and Ampere, Hopper, B200, B300 GPUs
-
-These dispatchers replace the standard token routing mechanism with an optimized "flex" dispatcher that provides better performance for MoE workloads.
-
-**Enable DeepEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-```
-
-**Enable HybridEP:**
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    expert_model_parallel_size=4,
-    # ... other model parameters
-)
-
-# Apply HybridEP optimization
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="hybridep")
-```
-
-**GPU Architecture Requirements:**
-
-- **DeepEP**: Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-- **HybridEP**: GB200, GB300 with NVL72, Ampere (SM 8.x), Hopper (SM 9.x), B200, B300
-
-The system automatically validates GPU compatibility and issues warnings if the dispatcher is not supported on the current hardware.
-
-##### Token Dropping for Load Balancing
-
-Token dropping improves MoE performance by balancing work across experts through capacity factors. This feature allows the model to drop tokens when experts are overloaded, preventing stragglers and improving overall throughput.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-model_config = GPTModelProvider(
-    num_moe_experts=8,
-    moe_router_topk=2,
-    moe_token_dispatcher_type="alltoall",  # Required for token dropping
-    moe_router_load_balancing_type="aux_loss",  # Required load balancing type
-    # ... other model parameters
-)
-
-# Apply token dropping with capacity factor
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,  # Capacity multiplier per expert
-    moe_pad_expert_input_to_capacity=True,  # Pad inputs to capacity length
-)
-```
-
-**Configuration Parameters:**
-
-- `moe_expert_capacity_factor`: Controls the maximum number of tokens each expert can process. A factor of 1.0 means each expert can handle exactly its proportional share of tokens. Lower values (e.g., 0.8) drop more tokens but improve load balancing.
-- `moe_pad_expert_input_to_capacity`: When enabled, pads expert inputs to the capacity length for consistent batch sizes.
-
-**Requirements:**
-
-- Token dispatcher must be `alltoall` or `alltoall_seq`
-- Load balancing type must be `aux_loss`, `seq_aux_loss`, or `none`
-
-**Trade-offs:**
-
-Token dropping can improve training throughput by 10-30% in imbalanced MoE models, but may affect convergence if too aggressive. Start with a capacity factor of 1.0 and gradually reduce if needed.
-
-#### Complete MoE Configuration Example
-
-Here's a complete example showing how to configure an MoE model with advanced optimizations:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend
-from megatron.bridge.training.utils.moe_token_drop import apply_moe_token_drop
-
-# Configure MoE model with expert parallelism
-model_config = GPTModelProvider(
-    num_layers=32,
-    hidden_size=4096,
-    num_attention_heads=32,
-    
-    # MoE configuration
-    num_moe_experts=8,                    # 8 experts total
-    moe_router_topk=2,                    # Activate 2 experts per token
-    moe_ffn_hidden_size=8192,            # Expert FFN hidden dimension
-    moe_token_dispatcher_type="alltoall", # Token dispatcher type
-    moe_router_load_balancing_type="aux_loss",  # Load balancing
-    
-    # Expert parallelism
-    expert_model_parallel_size=4,         # Distribute experts across 4 GPUs
-    expert_tensor_parallel_size=2,        # Apply TP within each expert
-    
-    # ... other model parameters
-)
-
-# Apply DeepEP optimization (for Ampere/Hopper GPUs)
-apply_flex_dispatcher_backend(model_config, moe_flex_dispatcher_backend="deepep")
-
-# Apply token dropping for load balancing
-apply_moe_token_drop(
-    model_config,
-    moe_expert_capacity_factor=1.0,
-    moe_pad_expert_input_to_capacity=True,
-)
-
-config = ConfigContainer(
-    model=model_config,
-    # ... other config parameters
-)
-```
-
-#### Expert Parallelism Implementation
-
-The Megatron Bridge implementation of EP uses functionality from Megatron Core. Please consult the [Megatron Core MoE layer](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/moe_layer.py#L42) for more MoE implementation details.
-
-## Activation Partitioning
-
-In LLM training, a large memory space is needed to store the input activations of the network layers. Megatron Bridge provides effective activation distribution methods through Megatron Core, which is critical in training LLMs with large sequence lengths or large per-GPU micro-batch sizes.
-
-### Sequence Parallelism
-
-Sequence Parallelism (SP) extends tensor-level model parallelism by distributing computing load and activation memory across multiple GPUs along the sequence dimension of transformer layers. This method is particularly useful for portions of the layer that have previously not been parallelized, enhancing overall model performance and efficiency.
-
-![Sequence Parallelism](images/sp.png)
-*Figure: Sequence Parallelism distributes the sequence dimension across multiple GPUs, reducing activation memory.*
-
-#### Enable Sequence Parallelism
-
-To utilize SP in Megatron Bridge, set the `sequence_parallel` parameter to `True` in your model configuration. Note that this feature is effective only when the tensor parallel size (`tensor_model_parallel_size`) is greater than `1`.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with sequence parallelism
-model_config = GPTModelProvider(
-    tensor_model_parallel_size=2,  # Required for sequence parallelism
-    sequence_parallel=True,        # Enable sequence parallelism
-    # ... other model parameters
-)
-```
-
-#### Implement Sequence Parallelism
-
-The Megatron Bridge implementation of SP utilizes functionality from Megatron Core. For an in-depth look at how Sequence Parallelism is integrated into the Megatron Core architecture, you can examine the source code: [Megatron-LM Sequence Parallel Source Code](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/layers.py).
-
-### Context Parallelism
-
-Context Parallelism (CP) is a method for parallelizing the processing of neural network activations across multiple GPUs by partitioning the input tensors along the sequence dimension. Unlike Sequence Parallelism (SP), which partitions the activations of specific layers, CP divides the activations of all layers.
-
-CP is critical for training long context models, as it allows the model to handle longer sequences by distributing the sequence activations across multiple GPUs. This method reduces the memory footprint and computational cost of processing long sequences.
-
-#### Enable Context Parallelism
-
-To activate CP in Megatron Bridge, set the `context_parallel_size` parameter in your model configuration. This parameter specifies the number of GPUs across which the model's sequence activations are distributed.
-
-```python
-from megatron.bridge.models import GPTModelProvider
-
-# Configure model with context parallelism
-model_config = GPTModelProvider(
-    context_parallel_size=2,  # Distribute sequence across 2 GPUs
-    # ... other model parameters
-)
-```
-
-For long context training scenarios, context parallelism is particularly effective and essential for handling sequences that exceed the memory capacity of individual GPUs.
-
-#### Implement Context Parallelism
-
-Megatron Bridge leverages functionalities from both Megatron Core and Transformer Engine to implement CP efficiently. During forward propagation, each GPU handles a segment of the sequence, storing only the necessary Key and Value (KV) pairs. In the backward pass, these KV pairs are reassembled across GPUs using communication schemes such as all-gather and reduce-scatter, which are transformed into point-to-point communications in a ring topology. This method reduces the memory footprint significantly while maintaining computational efficiency.
-
-For more detailed technical information and implementation details, visit:
-- [Megatron Core Context Parallelism Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/user-guide/features/context_parallel.html)
-- [Megatron Core wrappers for Transformer Engine](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/custom_layers/transformer_engine.py)
-- [Transformer Engine attention modules](https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/attention.py)
-
-## Combined Parallelism Example
-
-Megatron Bridge allows you to combine multiple parallelism strategies for optimal performance and memory efficiency:
-
-```python
-from megatron.bridge.models import GPTModelProvider
-from megatron.bridge.training.config import ConfigContainer, OptimizerConfig
-
-# Configure model with multiple parallelism strategies
-model_config = GPTModelProvider(
-    # Model parallelism
-    tensor_model_parallel_size=2,      # 2-way tensor parallelism
-    pipeline_model_parallel_size=4,    # 4-way pipeline parallelism
-    virtual_pipeline_model_parallel_size=2,  # Interleaved pipeline
-    
-    # Activation partitioning
-    sequence_parallel=True,            # Enable sequence parallelism (requires TP > 1)
-    context_parallel_size=2,           # 2-way context parallelism
-    
-    # Expert parallelism (for MoE models)
-    num_moe_experts=8,                 # 8 experts
-    expert_model_parallel_size=4,      # Distribute experts across 4 GPUs
-    
-    # ... other model parameters
-)
-
-# Configure distributed optimizer
-optimizer_config = OptimizerConfig(
-    optimizer="adam",
-    use_distributed_optimizer=True,    # Enable distributed optimizer
-    # ... other optimizer parameters
-)
-
-config = ConfigContainer(
-    model=model_config,
-    optimizer=optimizer_config,
-    # ... other config parameters
-)
-```
-
-## Data Parallel Size Calculation
-
-The data parallel size is automatically calculated based on the total world size and model parallelism settings:
-
-```
-data_parallel_size = world_size / (tensor_model_parallel_size × pipeline_model_parallel_size × context_parallel_size)
-```
-
-For example, with 32 GPUs total and the configuration above:
-- `tensor_model_parallel_size = 2`
-- `pipeline_model_parallel_size = 4` 
-- `context_parallel_size = 2`
-- `data_parallel_size = 32 / (2 × 4 × 2) = 2`
-
-## Strategy Selection Guide
-
-Choosing the right combination depends on model size, hardware topology,
-and sequence length.
-
-### Dense Models by Size
-
-| Model size | GPUs | Recommended starting point |
-|---|---|---|
-| < 1B | 1-8 | DP only |
-| 1-10B | 8-16 | TP=2-4 + DP |
-| 10-70B | 16-64 | TP=4-8 + PP=2-4 + DP |
-| 70-175B | 64-256 | TP=8 + PP=4-8 + DP |
-| 175-500B | 256-1024 | TP=8 + PP=8-16 + CP=2 + DP |
-
-### MoE Models
-
-MoE models differ fundamentally from dense models: only a fraction of
-parameters are active per token, so TP can often stay at 1 or 2. EP is
-the primary scaling dimension.
-
-| Total / active params | Typical layout |
-|---|---|
-| < 20B | EP only (TP=1, PP=1) |
-| 20-100B | TP=1-2 + PP=2-4 + EP=8-16 |
-| 100-500B | TP=2-4 + PP=8-16 + EP=8-32 |
-| 500B+ | TP=2 + PP=16 + EP=32-64 |
-
-### By Hardware Topology
-
-- **Single node with NVLink**: maximize TP within the node (up to TP=8).
-- **Multiple nodes with InfiniBand**: keep TP within a node, use PP across nodes.
-- **Limited network (Ethernet)**: minimize TP, prefer PP for cross-node scaling.
-
-### By Sequence Length
-
-| Sequence length | Recommendation |
-|---|---|
-| < 2K | standard TP + PP + DP |
-| 2K-8K | add SP (`sequence_parallel=True`) |
-| 8K-32K | add CP=2 |
-| 32K+ | add CP=4-8, consider hierarchical CP |
-
-For operational details on configuring combined parallelism, troubleshooting
-layouts, and memory estimation, see the
-[parallelism strategies skill](skills/perf-techniques/parallelism-strategies/SKILL.md).
-
-## Configuration Guidelines
-
-### Memory Optimization
-- Use **distributed optimizer** to reduce optimizer state memory
-- Enable **sequence parallelism** when using tensor parallelism to reduce activation memory
-- Use **context parallelism** for long sequence training
-- Consider **pipeline parallelism** for very large models that don't fit on a single GPU
-
-### Performance Optimization
-- **Tensor parallelism** works best within a single node (high bandwidth)
-- **Pipeline parallelism** can work across nodes but requires careful batch size tuning
-- **Context parallelism** is essential for long context scenarios
-- **Expert parallelism** is specific to MoE models; the number of experts should be divisible by the expert parallel size
-- **DeepEP/HybridEP** provide optimized MoE token dispatching on supported GPU architectures
-
-### Compatibility
-- **Sequence parallelism** requires `tensor_model_parallel_size > 1`
-- **Expert parallelism** requires MoE models (`num_moe_experts > 0`)
-- **DeepEP** requires Ampere, Hopper, B200, or B300 GPUs
-- **HybridEP** requires GB200, GB300 with NVL72, or Ampere, Hopper, B200, B300 GPUs
-- **Token dropping** requires `alltoall` or `alltoall_seq` token dispatcher
-- All parallelism strategies can be combined, but total parallelism must divide evenly into the world size
-
-## Related Artifacts
-
-- **Operational skill**: [skills/perf-techniques/parallelism-strategies/SKILL.md](skills/perf-techniques/parallelism-strategies/SKILL.md) — enablement, pitfalls, memory estimation, verification
-- **Knowledge card**: [skills/perf-techniques/parallelism-strategies/card.yaml](skills/perf-techniques/parallelism-strategies/card.yaml) — structured metadata and validation status
-
-## Resources
-
-- [Megatron Core Developer Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/)
-- [Scaling Language Model Training](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/)
-- [Megatron-LM Repository](https://github.com/NVIDIA/Megatron-LM)
-- [Transformer Engine](https://github.com/NVIDIA/TransformerEngine)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/bridge-guide.md
-```md
-# Get Started with 🤗 Hugging Face Conversion
-
-Megatron Bridge provides seamless bidirectional conversion between 🤗 Hugging Face Transformers and Megatron model definitions. This guide covers the main APIs for loading models, checking compatibility, and converting between formats.
-
-## Design and Goals
-
-- Single high-level entry point: `AutoBridge` detects HF model architectures and dispatches to the correct bridge.
-- Bidirectional conversion: Import HF → Megatron for training; export Megatron → HF for deployment.
-- Parallelism-aware: Handles TP/PP/VPP/CP/EP/ETP distributions during conversion.
-- Streaming and memory efficiency: per-parameter streaming using safetensors.
-- Provider pattern: Configure Megatron-Core `TransformerConfig`-compatible attributes before instantiation via `to_megatron_provider()`.
-- Convenience workflows: `import_ckpt` and `export_ckpt` provide one-call HF↔Megatron checkpoint flows.
-
-See the repository `README.md` for installation, supported models, and project highlights.
-
-## Loading a 🤗 Hugging Face Model into Megatron
-
-The easiest way to load a 🤗 Hugging Face model is using `AutoBridge.from_hf_pretrained()`, which automatically detects the model architecture and selects the appropriate bridge for conversion. You can then use `AutoBridge.to_megatron_model()` to initialize the Megatron model from the 🤗 Hugging Face configuration and load 🤗 Hugging Face weights at the same time.
-
-### Accessing Gated 🤗 Hugging Face Models
-
-Some models in Megatron Bridge require access to gated repositories on Hugging Face. These are models that require explicit permission from the model authors before you can download or use them.
-
-If you encounter an error like this when trying to use a model:
-
-```
-OSError: You are trying to access a gated repo.
-Make sure to have access to it at <URL>
-```
-
-Follow these steps to resolve the issue:
-
-1. **Request access**: Visit the URL provided in the error message and request access to the gated model
-2. **Generate a token**: Create a Hugging Face access token by following [this tutorial](https://huggingface.co/docs/hub/en/security-tokens#how-to-manage-user-access-tokens)
-3. **Set the environment variable**: Export your token in your environment:
-
-```bash
-export HF_TOKEN=<your_access_token>
-```
-
-### Basic Usage
-
-```python
-from megatron.bridge import AutoBridge
-
-# Load a supported model automatically
-bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
-
-# Create a provider, configure before instantiation, then build the model
-provider = bridge.to_megatron_provider()
-provider.tensor_model_parallel_size = 1
-provider.pipeline_model_parallel_size = 1
-provider.finalize()
-megatron_model = provider.provide_distributed_model(wrap_with_ddp=False)
-```
-
-### Advanced Loading Options
-You can also load models with specific settings such as precision, device placement, or by enabling trust in remote code:
-
-```python
-import torch
-from megatron.bridge import AutoBridge
-
-# Load with specific settings
-bridge = AutoBridge.from_hf_pretrained(
-    "meta-llama/Llama-2-7b-hf",
-    torch_dtype=torch.float16,
-    device_map="auto",
-    trust_remote_code=True,
-)
-
-# Load from local path
-bridge = AutoBridge.from_hf_pretrained("/path/to/local/hf_model")
-```
-
-### Using Model Providers
-
-For more control over model configuration, use the provider pattern. The provider lets you configure any `TransformerConfig` attribute:
-
-```python
-from megatron.bridge import AutoBridge
-
-# Load a supported model automatically
-bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
-
-# Get a model provider (lazy loading)
-provider = bridge.to_megatron_provider()
-
-# Configure parallelism (multi-GPU requires torchrun or srun)
-provider.tensor_model_parallel_size = 8
-provider.pipeline_model_parallel_size = 2
-
-# Configure fusions
-provider.bias_activation_fusion = True
-provider.bias_dropout_fusion = True
-
-# Finalize the provider to run validation checks and complete initialization
-provider.finalize()
-
-# Create the model with all configurations applied
-model = provider.provide_distributed_model(wrap_with_ddp=False)
-```
-
-The provider pattern is especially useful when you need to:
-- Override default model parameters
-- Configure advanced features like MoE, activation recomputation, or mixed precision
-- Set up distributed training parameters
-
-## Check Supported Models
-
-Before loading a model, you can check if it's supported by Megatron Bridge.
-
-You can list all supported 🤗 Hugging Face model architectures as follows:
-
-```python
-from megatron.bridge import AutoBridge
-
-# Get a list of all supported model architectures
-supported_models = AutoBridge.list_supported_models()
-
-print(f"Found {len(supported_models)} supported models:")
-for i, model in enumerate(supported_models, 1):
-    print(f"  {i:2d}. {model}")
-```
-
-Alternatively, check if a specific model is supported:
-
-```python
-from megatron.bridge import AutoBridge
-
-if AutoBridge.can_handle("meta-llama/Llama-3.2-1B"):
-    print("✅ Model is supported!")
-    bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
-else:
-    print("❌ Model requires a custom bridge implementation")
-```
-
-## Converting Back to 🤗 Hugging Face
-
-After training or modifying a Megatron model, you can convert it back to 🤗 Hugging Face format for deployment or sharing. The bridge provides several methods for this conversion depending on your needs.
-
-To save the complete model including configuration, tokenizer, and weights:
-
-```python
-# Save the complete model (config, tokenizer, weights)
-bridge.save_hf_pretrained(megatron_model, "./my-fine-tuned-llama")
-
-# The saved model can be loaded with 🤗 Hugging Face
-from transformers import AutoModelForCausalLM
-hf_model = AutoModelForCausalLM.from_pretrained("./my-fine-tuned-llama")
-```
-
-You can save the model weights (safetensors):
-
-```python
-# Save just the model weights (faster, smaller)
-bridge.save_hf_weights(megatron_model, "./model_weights")
-
-# Save without progress bar (useful in scripts)
-bridge.save_hf_weights(megatron_model, "./weights", show_progress=False)
-```
-
-You can also stream weights without saving to disk during conversion for on-the-fly use in RL frameworks, for example:
-
-```python
-# Stream weights during conversion (memory efficient)
-for name, weight in bridge.export_hf_weights(megatron_model):
-    print(f"Exporting {name}: {weight.shape}")
-
-for name, weight in bridge.export_hf_weights(megatron_model, cpu=True):
-    print(f"Exported {name}: {tuple(weight.shape)}")
-```
-
-## Common Patterns and Best Practices
-When working with Megatron Bridge, there are several patterns that will help you use the API effectively and avoid common pitfalls.
-
-### 1. Always Use High-Level APIs
-Always prefer high-level APIs like `AutoBridge` for automatic model detection. Avoid direct bridge usage unless you know the specific type required:
-
-```python
-# ✅ Preferred: Use AutoBridge for automatic detection
-bridge = AutoBridge.from_hf_pretrained("any-supported-model")
-
-# ❌ Avoid: Direct bridge usage unless you know the specific type
-```
-
-### 2. Configure Before Creating Models
-When using the provider pattern, always configure parallelism and other settings before creating the model. Creating the model first uses default settings that may not be optimal:
-
-```python
-# ✅ Correct: Configure provider before creating model
-provider = bridge.to_megatron_provider()
-provider.tensor_model_parallel_size = 8
-provider.finalize()
-model = provider.provide_distributed_model(wrap_with_ddp=False)
-
-# ❌ Avoid: Creating model before configuring parallelism
-model = bridge.to_megatron_model()  # Uses default settings
-```
-
-### 3. Leverage the Parameter Streaming API
-You can stream converted weights from Megatron to HF without saving to disk:
-
-```python
-# ✅ Use streaming for large models
-for name, weight in bridge.export_hf_weights(model, cpu=True):
-    process_weight(name, weight)
-```
-
-### 4. Use `from_hf_pretrained` for Export Workflows
-
-When exporting Megatron checkpoints back to 🤗 Hugging Face format, always use `from_hf_pretrained()` instead of `from_hf_config()`. The `from_hf_config()` method does not load the tokenizer and other artifacts required for saving a complete 🤗 Hugging Face checkpoint:
-
-```python
-from megatron.bridge import AutoBridge
-
-# ✅ Correct: Use from_hf_pretrained for export workflows
-bridge = AutoBridge.from_hf_pretrained("meta-llama/Llama-3.2-1B")
-bridge.export_ckpt("./megatron_checkpoints/llama32_1b", "./hf_exports/llama32_1b")
-
-# ❌ Avoid: from_hf_config lacks artifacts needed for saving
-# config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
-# bridge = AutoBridge.from_hf_config(config)  # Missing tokenizer, etc.
-# bridge.export_ckpt(...)  # Will fail!
-```
-
-The `from_hf_config()` method is only suitable for architecture exploration and introspection (e.g., inspecting `transformer_config`), not for checkpoint conversion workflows.
-
-For more examples and advanced usage patterns, see the `examples/conversion/` directory in the repository.
-
-## Convenience Workflows (Commands)
-
-These examples can be run directly as shell commands.
-
-### HF → Megatron checkpoint import (one call)
-
-```bash
-huggingface-cli login --token <your token>
-python -c "from megatron.bridge import AutoBridge; AutoBridge.import_ckpt('meta-llama/Llama-3.2-1B','./megatron_checkpoints/llama32_1b')"
-```
-
-### Megatron → HF export (one call)
-
-```bash
-python -c "from megatron.bridge import AutoBridge; b=AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B'); b.export_ckpt('./megatron_checkpoints/llama32_1b','./hf_exports/llama32_1b')"
-```
-
-### Create Megatron models and run locally
-
-```bash
-python - << 'PY'
-from megatron.bridge import AutoBridge
-
-bridge = AutoBridge.from_hf_pretrained('meta-llama/Llama-3.2-1B')
-provider = bridge.to_megatron_provider()
-provider.tensor_model_parallel_size = 1
-provider.pipeline_model_parallel_size = 1
-provider.finalize()
-model = provider.provide_distributed_model(wrap_with_ddp=False)
-
-# Export to HF folder
-bridge.save_hf_pretrained(model, './hf_exports/llama32_1b')
-PY
-```
-
-### Launch with multiple GPUs (example)
-
-```bash
-torchrun --nproc-per-node=2 -m examples.conversion.generate_from_hf
-```
-
-## AutoBridge API Reference
-
-Latest public APIs and signatures (see {doc}`apidocs/bridge/bridge.models.conversion.auto_bridge`):
-
-```python
-from megatron.bridge import AutoBridge
-
-# Creation and capability
-AutoBridge.from_hf_pretrained(path: str | Path, **kwargs) -> AutoBridge
-AutoBridge.from_hf_config(config: PretrainedConfig) -> AutoBridge
-AutoBridge.can_handle(path: str | Path, trust_remote_code: bool = False) -> bool
-AutoBridge.list_supported_models() -> list[str]
-AutoBridge.supports(config: Any) -> bool
-
-# Provider/model construction
-AutoBridge.to_megatron_provider(load_weights: bool = True, hf_path: str | Path | None = None) -> GPTModelProvider
-AutoBridge.to_megatron_model(load_weights: bool = True, hf_path: str | Path | None = None, **kwargs) -> list[MegatronModule]
-
-# HF → Megatron weights
-AutoBridge.load_hf_weights(model: list[MegatronModule], hf_path: str | Path | None = None) -> None
-
-# Megatron → HF conversion
-AutoBridge.export_hf_weights(model: list[MegatronModule], cpu: bool = False, show_progress: bool = True, conversion_tasks: Optional[list[WeightConversionTask]] = None) -> Iterable[HFWeightTuple]
-AutoBridge.save_hf_pretrained(model: list[MegatronModule], path: str | Path, show_progress: bool = True) -> None
-AutoBridge.save_hf_weights(model: list[MegatronModule], path: str | Path, show_progress: bool = True) -> None
-
-# Megatron native checkpoints
-AutoBridge.save_megatron_model(model: list[MegatronModule], path: str | Path) -> None
-AutoBridge.load_megatron_model(path: str | Path, **kwargs) -> list[MegatronModule]
-
-# One-call workflows
-AutoBridge.import_ckpt(hf_model_id: str | Path, megatron_path: str | Path, **kwargs) -> None  # HF → Megatron ckpt
-AutoBridge.export_ckpt(megatron_path: str | Path, hf_path: str | Path, show_progress: bool = True) -> None  # Megatron → HF
-
-# Config extraction
-AutoBridge.transformer_config -> TransformerConfig
-AutoBridge.mla_transformer_config -> MLATransformerConfig
-
-# Introspection / planning
-AutoBridge.get_conversion_tasks(megatron_model: MegatronModule | list[MegatronModule], hf_path: str | Path | None = None) -> list[WeightConversionTask]
-```
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/docs/recipe-usage.md
-```md
-# Using Recipes
-
-Megatron Bridge provides production-ready training recipes for several popular models. You can find an overview of supported recipes and 🤗 HuggingFace bridges [here](index.md#supported-models).
-This guide will cover the next steps to make use of a training recipe, including how to [override configuration](#overriding-configuration) and how to [launch a job](#launch-methods).
-
-## Overview
-
-- **Coverage**: We provide recipes across select model families and sizes, including Llama, Qwen, DeepSeek, and Nemotron-H (Mamba-based).
-- **Defaults**: Each recipe sets defaults meant for convergence and performance across parallelisms, precision data types, and optimizer & scheduler choices. These recipes can be used as a high-quality starting point. 
-- **Integration**: Recipes return a single `ConfigContainer` that plugs directly into our training [entry points](training/entry-points.md) (see the published docs as well: https://docs.nvidia.com/nemo/megatron-bridge/latest/training/entry-points.html).
-- **Customization**: You can override any part of the recipe (Python, YAML, CLI) to adapt to your data, scale, and objectives.
-
-## Overriding configuration
-
-Recipes are provided through a {py:class}`~bridge.training.config.ConfigContainer` object. This is a dataclass that holds all configuration objects needed for training. You can find a more detailed overview of the `ConfigContainer` [here](training/config-container-overview.md).
-The benefit of providing the full recipe through a Pythonic structure is that it is agnostic to any configuration approach that a user may prefer, whether that's YAML, `argparse`, or something else. In other words, the user may override the recipe however they see fit.
-
-The following sections detail a few different ways to override the configuration recipe. For a complete training script, please see [this example](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b.py).
-
-
-### Python
-
-If you prefer to manage configuration in Python, you can directly modify attributes of the `ConfigContainer`:
-
-```python
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-
-# Get the base ConfigContainer from the recipe
-cfg: ConfigContainer = pretrain_config()
-
-# Apply overrides. Note the hierarchical structure
-cfg.train.train_iters = 20
-cfg.train.global_batch_size = 8
-cfg.train.micro_batch_size = 1
-cfg.logger.log_interval = 1
-```
-
-You can also replace entire sub-configs of the `ConfigContainer`:
-
-```python
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.models.llama import Llama3ModelProvider
-
-cfg: ConfigContainer = pretrain_config()
-
-small_llama = Llama3ModelProvider(
-    num_layers=2,
-    hidden_size=768,
-    ffn_hidden_size=2688,
-    num_attention_heads=16,
-)
-cfg.model = small_llama
-```
-
-### YAML
-Overriding a configuration recipe with a YAML file can be done using OmegaConf utilities:
-
-```python
-from omegaconf import OmegaConf
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-)
-
-cfg: ConfigContainer = pretrain_config()
-yaml_filepath = "conf/llama3-8b-benchmark-cfg.yaml"
-
-# Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-# excluded_fields holds some configuration that cannot be serialized into a DictConfig
-merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-# Load and merge YAML overrides
-yaml_overrides_omega = OmegaConf.load(yaml_filepath)
-merged_omega_conf = OmegaConf.merge(merged_omega_conf, yaml_overrides_omega)
-
-# Apply overrides while preserving excluded fields
-final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-```
-
-The above snippet will update `cfg` with all overrides from `llama3-8b-benchmark-cfg.yaml`.
-
-### Hydra-style
-
-Megatron Bridge provides some utilities to update the ConfigContainer using Hydra-style CLI overrides:
-
-```python
-import sys
-from omegaconf import OmegaConf
-from megatron.bridge.recipes.llama.llama3_8b import pretrain_config
-from megatron.bridge.training.utils.omegaconf_utils import (
-    apply_overrides,
-    create_omegaconf_dict_config,
-    parse_hydra_overrides,
-)
-
-cfg: ConfigContainer = pretrain_config()
-cli_overrides = sys.argv[1:]
-
-# Convert the initial Python dataclass to an OmegaConf DictConfig for merging
-# excluded_fields holds some configuration that cannot be serialized into a DictConfig
-merged_omega_conf, excluded_fields = create_omegaconf_dict_config(cfg)
-
-# Parse and merge CLI overrides
-merged_omega_conf = parse_hydra_overrides(merged_omega_conf, cli_overrides)
-
-# Apply overrides while preserving excluded fields
-final_overrides_as_dict = OmegaConf.to_container(merged_omega_conf, resolve=True)
-apply_overrides(cfg, final_overrides_as_dict, excluded_fields)
-```
-
-After the above snippet, `cfg` will be updated with all CLI-provided overrides. 
-A script containing the above code could be called like so:
-
-```sh
-torchrun <torchrun arguments> pretrain_cli_overrides.py model.tensor_model_parallel_size=4 train.train_iters=100000 ...
-```
-
-## Launch methods
-
-Megatron Bridge supports launching scripts with both `torchrun` and [NeMo-Run](https://github.com/NVIDIA-NeMo/Run).
-Once your script is ready to be launched, refer to one of the following sections.
-
-### Torchrun
-Megatron Bridge training scripts can be launched with the `torchrun` command that most PyTorch users are familiar with.
-Simply specify the number of GPUs to use with `--nproc-per-node` and the number of nodes with `--nnodes`. For example, on a single node:
-
-```sh
-torchrun --nnodes 1 --nproc-per-node 8 /path/to/train/script.py <args to pretrain script>
-```
-
-For multi-node training, it is recommended to use a cluster orchestration system like SLURM.
-The `torchrun` command should be wrapped as specified by your cluster orchestration system.
-For example, with Slurm, wrap the `torchrun` command inside of `srun`:
-
-```sh
-# launch.sub
-
-srun --nodes 2 --gpus-per-node 8 \
-    --container-image <image tag> --container-mounts <mounts> \
-    bash -c "
-        torchrun --nnodes $SLURM_NNODES --nproc-per-node $SLURM_GPUS_PER_NODE /path/to/train/script.py <args to pretrain script>
-    "
-```
-
-Add any other flags that your cluster requires to the `srun` command. It is also recommended to use a NeMo Framework container with Slurm. You can find a list of container tags on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags).
-
-### NeMo-Run
-
-Megatron Bridge also supports launching training with [NeMo-Run](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html). NeMo-Run is a Python package that enables configuring and executing experiments across several platforms.
-For multi-node training, NeMo-Run will generate a script with appropriate commands, similar to the `srun` command described above.
-
-The recommended method to launch a Megatron Bridge script with NeMo-Run is through the `run.Script` API.
-You can modify the following 3 steps to your needs in a new file:
-
-```python
-import nemo_run as run
-
-if __name__ == "__main__":
-    # 1) Configure the `run.Script` object
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-
-    # 2) Define an executor for the desired target platform
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    # 3) Execute
-    run.run(train_script, executor=executor)
-```
-
-NeMo-Run supports launching on several different platforms, including [SLURM clusters](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#slurmexecutor).
-For more details, please see the NeMo-Run [documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/guides/execution.html#) for a list of supported platforms, their corresponding executors, and configuration instructions.
-
-You can also forward arguments from the NeMo-Run launch script to the target script:
-
-```python
-import nemo_run as run
-import argparse
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    ...
-    known_args, args_to_fwd = parser.parse_known_args()
-    train_script = run.Script(..., args=args_to_fwd)
-```
-
-For a complete example of the `run.Script` API, including argument forwarding, please see [this script](https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/models/llama/pretrain_llama3_8b_nemo_run_script.py).
-
-#### Plugins
-
-Megatron Bridge provides several NeMo-Run plugins to simplify the usage of certain features.
-These plugins can simply be added to the `run.run()` call:
-
-```python
-import nemo_run as run
-from megatron.bridge.recipes.run_plugins import NsysPlugin
-
-if __name__ == "__main__":
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    plugins = [] # plugins argument expects a list
-    nsys = NsysPlugin(profile_step_start=10, profile_step_end=15, ...)
-    plugins.append(nsys)
-    run.run(train_script, plugins=plugins, executor=executor)
-```
-
-##### Custom Argument Converters
-
-By default, plugins convert their configuration to Hydra-style CLI arguments when used with `run.Script` tasks. If your training script uses a different argument format (e.g., argparse), you can provide a custom converter function via the `script_args_converter_fn` parameter.
-
-```python
-import nemo_run as run
-from typing import List
-from megatron.bridge.recipes.run_plugins import (
-    PreemptionPlugin,
-    PreemptionPluginScriptArgs,
-)
-
-# Define a custom converter for argparse-style arguments
-def argparse_preemption_converter(args: PreemptionPluginScriptArgs) -> List[str]:
-    result = []
-    if args.enable_exit_handler:
-        result.append("--enable-exit-handler")
-    if args.enable_exit_handler_for_data_loader:
-        result.append("--enable-exit-handler-dataloader")
-    return result
-
-if __name__ == "__main__":
-    train_script = run.Script(path="/path/to/train/script.py", entrypoint="python")
-    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
-
-    # Use the plugin with the custom converter
-    plugin = PreemptionPlugin(
-        preempt_time=120,
-        enable_exit_handler=True,
-        script_args_converter_fn=argparse_preemption_converter,
-    )
-    run.run(train_script, plugins=[plugin], executor=executor)
-```
-
-Each plugin provides its own corresponding dataclass (e.g., `PreemptionPluginScriptArgs`, `NsysPluginScriptArgs`) that defines the available arguments for conversion.
-
-See the [API reference](#bridge.recipes.run_plugins) for a list of available NeMo-Run plugins.
-
-### Avoiding Hangs
-
-When working with any scripts in Megatron Bridge, please make sure you wrap your code in an `if __name__ == "__main__":`
-block. Otherwise, your code may hang unexpectedly.
-
-The reason for this is that Megatron Bridge uses Python's `multiprocessing` module in the backend when running a
-multi-GPU job. The multiprocessing module will create new Python processes that will import the current module (your
-script). If you do not add the `if __name__ == "__main__":` guard, each newly spawned process re-runs your script's
-top-level code on import and spawns further processes of its own. This results in an infinite loop of process spawning.
-
-## Resources
-
-- [OmegaConf documentation](https://omegaconf.readthedocs.io/en/2.3_branch/)
-- [torchrun Documentation](https://docs.pytorch.org/docs/stable/elastic/run.html)
-- [PyTorch Multinode Training documentation](https://docs.pytorch.org/tutorials/intermediate/ddp_series_multinode.html)
-- [NeMo-Run documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemorun/index.html#)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/mlm-bridge-training/SKILL.md
-```md
----
-name: mlm-bridge-training
-description: Run Megatron-LM (MLM) and Megatron Bridge training with mock or real data. Covers correlation testing, available recipes, and multi-GPU examples. Use when running training, comparing MLM vs Bridge, or translating configs.
----
-
-# MLM vs Bridge Training
-
-For how they differ, the arg mapping tables, gotchas, and translation script, see:
-
-- `docs/megatron-lm-to-megatron-bridge.md`
-
-## Correlation Testing
-
-Use `vanilla_gpt_pretrain_config` for loss-correlation testing. This recipe uses
-bare `GPTModelProvider` defaults (LayerNorm, GeLU, learned_absolute position
-embeddings, `vocab_size` inherited from tokenizer) — matching MLM
-`pretrain_gpt.py` defaults with no args.
-
-### MLM Correlation Run (2L/256H, 1 GPU)
-
-```bash
-PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \
-uv run python -m torch.distributed.run --nproc_per_node=1 \
-  3rdparty/Megatron-LM/pretrain_gpt.py \
-  --num-layers 2 --hidden-size 256 --num-attention-heads 4 \
-  --ffn-hidden-size 1024 --seq-length 512 --max-position-embeddings 512 \
-  --micro-batch-size 4 --global-batch-size 32 \
-  --train-iters 10 --eval-iters 2 --eval-interval 10 \
-  --mock-data --bf16 --use-mcore-models \
-  --tokenizer-type NullTokenizer --vocab-size 32000 \
-  --lr 3e-4 --min-lr 3e-5 --seed 1234 --log-interval 1
-```
-
-### Bridge Correlation Run (same config, 1 GPU)
-
-```bash
-rm -rf nemo_experiments && \
-uv run python -m torch.distributed.run --nproc_per_node=1 \
-  scripts/training/run_recipe.py \
-  --recipe vanilla_gpt_pretrain_config \
-  model.num_layers=2 model.hidden_size=256 \
-  model.num_attention_heads=4 model.ffn_hidden_size=1024 \
-  model.seq_length=512 dataset.sequence_length=512 \
-  train.train_iters=10 train.global_batch_size=32 train.micro_batch_size=4 \
-  validation.eval_interval=10 validation.eval_iters=2 \
-  optimizer.lr=3e-4 optimizer.min_lr=3e-5 \
-  scheduler.lr_warmup_iters=1 scheduler.lr_decay_iters=10 \
-  rng.seed=1234 logger.log_interval=1
-```
-
-### Verification
-
-With matched parameters the LM losses should be nearly identical at each
-iteration. Compare `lm loss` values from both logs — they should agree to
-within BF16 rounding.
-
-## Multi-GPU Examples
-
-### MLM 2-GPU with TP=2
-
-```bash
-PYTHONPATH=3rdparty/Megatron-LM:$PYTHONPATH \
-uv run python -m torch.distributed.run --nproc_per_node=2 \
-  3rdparty/Megatron-LM/pretrain_gpt.py \
-  --tensor-model-parallel-size 2 --sequence-parallel \
-  --num-layers 4 --hidden-size 256 --num-attention-heads 4 \
-  --seq-length 1024 --max-position-embeddings 1024 \
-  --micro-batch-size 2 --global-batch-size 16 \
-  --train-iters 10 --eval-iters 2 --eval-interval 10 \
-  --mock-data --bf16 --use-mcore-models \
-  --tokenizer-type NullTokenizer --vocab-size 1024 \
-  --lr 1e-4 --log-interval 1
-```
-
-### Bridge 2-GPU with TP=2
-
-```bash
-rm -rf nemo_experiments && \
-uv run python -m torch.distributed.run --nproc_per_node=2 \
-  scripts/training/run_recipe.py \
-  --recipe vanilla_gpt_pretrain_config \
-  model.tensor_model_parallel_size=2 model.sequence_parallel=true \
-  model.num_layers=4 model.hidden_size=256 \
-  model.num_attention_heads=4 model.ffn_hidden_size=1024 \
-  model.seq_length=1024 dataset.sequence_length=1024 \
-  train.train_iters=10 train.global_batch_size=16 train.micro_batch_size=2 \
-  validation.eval_interval=10 validation.eval_iters=2 \
-  scheduler.lr_warmup_iters=2 scheduler.lr_decay_iters=10 \
-  logger.log_interval=1
-```
-
-## Available Recipes
-
-Common recipes (use with `--recipe`):
-
-- `vanilla_gpt_pretrain_config` — Minimal GPT (bare GPTModelProvider defaults,
-  ideal for correlation testing and custom configs)
-- `llama32_1b_pretrain_config` — Llama 3.2 1B (16L, 2048H, GBS=512, seq=8192)
-- `llama3_8b_pretrain_config` — Llama 3 8B
-- `qwen3_8b_pretrain_config` — Qwen3 8B
-- `deepseek_v2_lite_pretrain_config` — DeepSeek-V2-Lite 16B MoE
-
-SFT/PEFT variants use `_sft_config` / `_peft_config` suffix.
-
-## Megatron-Core Submodule
-
-For what the submodule is and why two versions exist, see
-`docs/megatron-lm-to-megatron-bridge.md`.
-
-### Check current version
-
-```bash
-./scripts/switch_mcore.sh status
-```
-
-### Switch to dev for testing newer MCore features
-
-```bash
-./scripts/switch_mcore.sh dev
-
-# uv sync (without --locked) since lockfile is for main
-uv sync
-```
-
-### Switch back to main
-
-```bash
-./scripts/switch_mcore.sh main
-```
-
-### After pulling latest main
-
-When you pull the latest Bridge main branch, the submodule pointer may have
-been updated. Re-sync the submodule:
-
-```bash
-git submodule update --init 3rdparty/Megatron-LM
-```
-
-## Pitfalls
-
-1. **Always `rm -rf nemo_experiments`** before a fresh correlation run. Bridge
-   auto-resumes from stale checkpoints silently.
-
-2. **`uv run` required**: Always use `uv run python -m torch.distributed.run`
-   (not bare `torchrun` or `python`).
-
-3. **MLM PYTHONPATH**: Must include `3rdparty/Megatron-LM` so `gpt_builders.py`
-   is importable.
-
-4. **Scheduler overrides**: When overriding `train.train_iters` to a small
-   value, also set `scheduler.lr_warmup_iters` and `scheduler.lr_decay_iters`
-   or you get an assertion error.
-
-5. **Use `dataset.sequence_length`** in CLI overrides, not `dataset.seq_length`.
-
-6. **MoE OOM**: Large MoE models require full activation recomputation and
-   typically multi-node EP. TP does NOT reduce per-GPU expert memory.
-
-7. **`uv sync --locked` fails after switching to dev**: The lockfile is generated
-   against the main MCore commit. Use `uv sync` (without `--locked`) when on dev.
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/skills/perf-techniques/sequence-packing/SKILL.md
-```md
----
-name: sequence-packing
-description: Operational guide for enabling packed sequences and long-context config paths in Megatron-Bridge, including config knobs, code anchors, pitfalls, and verification.
----
-
-# Sequence Packing Skill
-
-For stable background and recommendation level, see:
-
-- `docs/training/packed-sequences.md`
-- `card.yaml` (co-located)
-
-## Enablement
-
-Offline packed SFT for LLM finetuning:
-
-```python
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-
-cfg.train.micro_batch_size = 1
-cfg.dataset.seq_length = 4096
-cfg.model.seq_length = 4096
-cfg.dataset.dataset_kwargs = {"pad_to_max_length": True}
-cfg.dataset.packed_sequence_specs = PackedSequenceSpecs(
-    packed_sequence_size=4096,
-    pad_seq_to_mult=1,
-)
-```
-
-If CP is enabled:
-
-```python
-cfg.model.context_parallel_size = 2
-cfg.model.calculate_per_token_loss = True
-cfg.ddp.average_in_collective = False
-cfg.dataset.packed_sequence_specs.pad_seq_to_mult = cfg.model.context_parallel_size * 2
-```
-
-If CUDA graphs are enabled for this packed path:
-
-```python
-cfg.dataset.packed_sequence_specs.pad_cu_seqlens = True
-cfg.dataset.dataset_kwargs["pad_to_max_length"] = True
-```
-
-**Note:** `pad_cu_seqlens = True` also requires a metadata JSON file alongside
-the packed dataset (asserted in `src/megatron/bridge/data/datasets/sft.py`).
-Custom packed datasets that omit the metadata file will hit an assertion at
-dataset initialization.
-
-In-batch packing for VLM finetuning:
-
-```python
-cfg.dataset.pack_sequences_in_batch = True
-cfg.train.micro_batch_size = 2
-```
-
-Long-context baseline:
-
-```python
-cfg.model.seq_length = 16384
-cfg.dataset.seq_length = 16384
-cfg.model.context_parallel_size = 2
-```
-
-## Code Anchors
-
-LLM packed SFT config surface:
-
-```72:97:src/megatron/bridge/recipes/utils/finetune_utils.py
-if packed_sequence:
-    dataset_kwargs = {"pad_to_max_length": True}
-    packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult)
-else:
-    dataset_kwargs = {}
-    packed_sequence_specs = None
-```
-
-Bridge validation:
-
-```1617:1657:src/megatron/bridge/training/config.py
-if self.model.context_parallel_size > 1:
-    assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, ...
-    if isinstance(self.dataset, FinetuningDatasetConfig):
-        assert self.model.calculate_per_token_loss, ...
-        assert not self.ddp.average_in_collective, ...
-...
-if ... packed_sequence_size > 0 and self.train.micro_batch_size > 1:
-    raise ValueError(...)
-...
-if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1:
-    raise ValueError(...)
-```
-
-VLM in-batch runtime:
-
-```308:327:src/megatron/bridge/training/vlm_step.py
-if enable_packing:
-    ...
-    ) = pack_batch_sequences(
-        ...
-        pad_token_id=0,
-        pad_to_multiple_of=cp_size * 2 if cp_size > 1 else 1,
-    )
-```
-
-Packed THD runtime constraint:
-
-```61:64:src/megatron/bridge/training/gpt_step.py
-if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1:
-    raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)")
-```
-
-## Pitfalls
-
-1. Offline packed SFT and VLM in-batch packing are different features with opposite micro-batch rules.
-2. When CP is enabled, packed sequence lengths must respect `2 * context_parallel_size` divisibility.
-3. For finetuning with CP, `calculate_per_token_loss=True` and `ddp.average_in_collective=False` are required.
-4. `pad_cu_seqlens=True` also requires `pad_to_max_length=True`.
-5. Packing support is model-family-specific. `Qwen3-Next`, `GLM-4.5`, and `Qwen3.5-VL` contain explicit opt-outs in different paths.
-6. MTP finetuning is documented as incompatible with packed sequences.
-
-## Verification
-
-Use the checked-in unit coverage:
-
-```bash
-uv run python -m pytest tests/unit_tests/training/utils/test_packed_seq_utils.py -v && \
-uv run python -m pytest tests/unit_tests/training/test_config.py -k "packed_sequence or pack_sequences_in_batch or context_parallel_seq_length_divisibility or context_parallel_finetuning_validations" -v && \
-uv run python -m pytest tests/unit_tests/training/test_vlm_step.py -k "enable_packing" -v
-```
-
-Success criteria:
-
-- first command reports `8 passed`
-- second command reports `14 passed`
-- third command reports `2 passed`
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/common.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from megatron.core.distributed import DistributedDataParallelConfig
-
-from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider
-from megatron.bridge.peft.lora import LoRA
-from megatron.bridge.recipes.utils.finetune_utils import default_squad_config
-from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
-from megatron.bridge.recipes.utils.tokenizer_utils import DEFAULT_NULL_TOKENIZER_VOCAB_SIZE
-from megatron.bridge.training.config import (
-    CheckpointConfig,
-    ConfigContainer,
-    DistributedInitConfig,
-    GPTDatasetConfig,
-    LoggerConfig,
-    RNGConfig,
-    TokenizerConfig,
-    TrainingConfig,
-    ValidationConfig,
-)
-
-
-def _pretrain_common() -> ConfigContainer:
-    """Create a base pre-training ConfigContainer with common defaults for any language model.
-
-    This function returns a ConfigContainer template with sensible defaults.
-    The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model` before use;
-    both are left as `None` placeholders here.
-
-    Defaults visible in this template:
-    - Output paths are built from `os.getcwd()` at call time:
-      `<cwd>/nemo_experiments/default/checkpoints` and `<cwd>/nemo_experiments/default/tb_logs`.
-    - `checkpoint.save` and `checkpoint.load` point at the same directory.
-    - Optimizer and scheduler come from `distributed_fused_adam_with_cosine_annealing`
-      with `lr_warmup_iters=500`, `max_lr=3e-4`, `min_lr=3e-5`.
-    - The dataset uses `blend=None` (mock data mode) and `seq_length=4096`.
-    - `mixed_precision="bf16_mixed"` and `comm_overlap=None`.
-
-    Returns:
-        ConfigContainer: Base configuration template for pre-training.
-    """
-    # Default output directories (resolved against the current working directory)
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default optimizer and scheduler
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=3e-4,
-        min_lr=3e-5,
-    )
-
-    cfg = ConfigContainer(
-        # Model - MUST be set by each recipe before use
-        model=None,  # type: ignore[arg-type]
-        # Training config
-        train=TrainingConfig(
-            train_iters=300000,
-            global_batch_size=32,
-            micro_batch_size=2,
-            manual_gc=True,
-            manual_gc_interval=100,
-            manual_gc_eval=100,
-        ),
-        validation=ValidationConfig(
-            eval_interval=500,
-            eval_iters=32,
-        ),
-        # Optimizer and scheduler
-        optimizer=opt_cfg,
-        scheduler=scheduler_cfg,
-        # DDP config - these are the commonly overridden settings
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=True,
-            overlap_grad_reduce=True,
-            overlap_param_gather=True,
-            average_in_collective=True,
-            data_parallel_sharding_strategy="optim_grads_params",
-            use_distributed_optimizer=True,
-        ),
-        # Dataset config - uses mock data by default
-        dataset=GPTDatasetConfig(
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            seq_length=4096,
-            num_dataset_builder_threads=1,
-            blend=None,  # Mock data mode
-            blend_per_split=None,
-            split="9999,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        ),
-        # Logger config
-        logger=LoggerConfig(
-            log_interval=10,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        # Tokenizer - placeholder, each recipe should set tokenizer_model
-        tokenizer=TokenizerConfig(
-            tokenizer_type="HuggingFaceTokenizer",
-            tokenizer_model=None,  # Must be set by each recipe
-        ),
-        # Checkpoint config - save and load share one directory
-        # NOTE(review): presumably this lets a rerun pick up the last saved checkpoint - confirm
-        checkpoint=CheckpointConfig(
-            save_interval=500,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-        ),
-        # RNG config
-        rng=RNGConfig(seed=1234),
-        # Distributed init config
-        dist=DistributedInitConfig(),
-        comm_overlap=None,
-        # Mixed precision - bf16 by default
-        mixed_precision="bf16_mixed",
-    )
-
-    return cfg
-
-
-def _sft_common() -> ConfigContainer:
-    """Create a base SFT (Supervised Fine-Tuning) ConfigContainer with common defaults.
-
-    This function returns a ConfigContainer template with sensible defaults for full SFT
-    (not LoRA/DoRA). The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model`
-    before use; both are left as `None` placeholders here.
-
-    Key differences from the `_pretrain_common` template:
-    - Dataset config comes from `default_squad_config(...)` with packed sequences
-      enabled, `seq_length=2048` and `pad_seq_to_mult=1`
-    - Lower learning rate (`max_lr=5e-6`, `min_lr=0.0`) and `adam_beta2=0.98`
-    - Fewer training iterations (1000) and shorter warmup (50 iterations)
-    - Different batch sizes: `global_batch_size=128`, `micro_batch_size=1`
-      (the pre-training template uses 32 and 2)
-    - Only `check_for_nan_in_grad` and `grad_reduce_in_fp32` are set on the DDP config
-    - Exposes `checkpoint.pretrained_checkpoint` (default `None`)
-    - RNG seed 5678 instead of 1234
-    - No PEFT (`peft=None`, full parameter training)
-
-    Returns:
-        ConfigContainer: Base configuration template for full SFT.
-    """
-    # Default output directories (resolved against the current working directory)
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for SFT
-    seq_length = 2048
-
-    # Packed sequence is enabled by default for training efficiency
-    # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1
-    packed_sequence = True
-    pad_seq_to_mult = 1  # Override in model config if context_parallel_size > 1
-
-    # Optimizer and scheduler with lower LR for full SFT
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=50,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=5e-6,  # Lower LR for full fine-tuning
-        min_lr=0.0,
-        adam_beta2=0.98,  # Common for fine-tuning
-    )
-
-    cfg = ConfigContainer(
-        # Model - MUST be set by each recipe before use
-        model=None,  # type: ignore[arg-type]
-        # Training config - shorter training for SFT
-        train=TrainingConfig(
-            train_iters=1000,
-            global_batch_size=128,
-            micro_batch_size=1,
-        ),
-        validation=ValidationConfig(
-            eval_interval=100,
-            eval_iters=32,
-        ),
-        # Optimizer and scheduler
-        optimizer=opt_cfg,
-        scheduler=scheduler_cfg,
-        # DDP config - minimal settings, model-specific configs can override
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=True,
-        ),
-        # Dataset config - uses SQuAD with packed sequences by default
-        dataset=default_squad_config(
-            seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult
-        ),
-        # Logger config
-        logger=LoggerConfig(
-            log_interval=1,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        # Tokenizer - placeholder, each recipe should set tokenizer_model
-        tokenizer=TokenizerConfig(
-            tokenizer_type="HuggingFaceTokenizer",
-            tokenizer_model=None,  # Must be set by each recipe
-        ),
-        # Checkpoint config with pretrained_checkpoint support; save and load share one directory
-        checkpoint=CheckpointConfig(
-            save_interval=100,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            pretrained_checkpoint=None,  # Set to load from pretrained weights
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-        ),
-        # RNG config - different seed from pretrain
-        rng=RNGConfig(seed=5678),
-        # Distributed init config
-        dist=DistributedInitConfig(),
-        comm_overlap=None,
-        # Mixed precision - bf16 by default
-        mixed_precision="bf16_mixed",
-        # No PEFT for full SFT
-        peft=None,
-    )
-
-    return cfg
-
-
-def _peft_common() -> ConfigContainer:
-    """Create a base PEFT (Parameter-Efficient Fine-Tuning) ConfigContainer with LoRA defaults.
-
-    This function returns a ConfigContainer template with sensible defaults for PEFT
-    using LoRA. The caller MUST set `cfg.model` and `cfg.tokenizer.tokenizer_model`
-    before use.
-
-    Key differences from full SFT:
-    - Higher learning rate (1e-4) suitable for adapter training
-    - LoRA enabled by default with standard settings (dim=32, alpha=32)
-    - Targets all linear layers: linear_qkv, linear_proj, linear_fc1, linear_fc2
-
-    Returns:
-        ConfigContainer: Base configuration template for PEFT with LoRA.
-    """
-    # Default output directories
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for PEFT
-    seq_length = 2048
-
-    # Packed sequence is enabled by default for training efficiency
-    # pad_seq_to_mult should be set to context_parallel_size * 2 if CP > 1
-    packed_sequence = True
-    pad_seq_to_mult = 1  # Override in model config if context_parallel_size > 1
-
-    # Optimizer and scheduler with higher LR for PEFT (only training adapters)
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=50,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=1e-4,  # Higher LR for adapter training
-        min_lr=0.0,
-        adam_beta2=0.98,  # Common for fine-tuning
-    )
-
-    cfg = ConfigContainer(
-        # Model - MUST be set by each recipe before use
-        model=None,  # type: ignore[arg-type]
-        # Training config - shorter training for PEFT
-        train=TrainingConfig(
-            train_iters=1000,
-            global_batch_size=128,
-            micro_batch_size=1,
-        ),
-        validation=ValidationConfig(
-            eval_interval=100,
-            eval_iters=32,
-        ),
-        # Optimizer and scheduler
-        optimizer=opt_cfg,
-        scheduler=scheduler_cfg,
-        # DDP config - minimal settings for PEFT
-        ddp=DistributedDataParallelConfig(
-            check_for_nan_in_grad=True,
-            grad_reduce_in_fp32=True,
-        ),
-        # Dataset config - uses SQuAD with packed sequences by default
-        dataset=default_squad_config(
-            seq_length=seq_length, packed_sequence=packed_sequence, pad_seq_to_mult=pad_seq_to_mult
-        ),
-        # Logger config
-        logger=LoggerConfig(
-            log_interval=1,
-            tensorboard_dir=tensorboard_dir,
-            log_timers_to_tensorboard=True,
-        ),
-        # Tokenizer - placeholder, each recipe should set tokenizer_model
-        tokenizer=TokenizerConfig(
-            tokenizer_type="HuggingFaceTokenizer",
-            tokenizer_model=None,  # Must be set by each recipe
-        ),
-        # Checkpoint config with pretrained_checkpoint support
-        checkpoint=CheckpointConfig(
-            save_interval=100,
-            save=checkpoint_dir,
-            load=checkpoint_dir,
-            pretrained_checkpoint=None,  # Set to load from pretrained weights
-            ckpt_format="torch_dist",
-            fully_parallel_save=True,
-        ),
-        # RNG config - different seed from pretrain
-        rng=RNGConfig(seed=5678),
-        # Distributed init config
-        dist=DistributedInitConfig(),
-        comm_overlap=None,
-        # Mixed precision - bf16 by default
-        mixed_precision="bf16_mixed",
-        # LoRA config with standard defaults
-        peft=LoRA(
-            target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
-            dim=32,
-            alpha=32,
-            dropout=0.0,
-            dropout_position="pre",
-            lora_A_init_method="xavier",
-            lora_B_init_method="zero",
-            a2a_experimental=False,
-            lora_dtype=None,  # Uses model's dtype
-        ),
-    )
-
-    return cfg
-
-
-def _sft_common_vlm() -> ConfigContainer:
-    """Create a base SFT ConfigContainer with common defaults for Vision-Language Models.
-
-    This function inherits from `_sft_common()` and overrides VLM-specific settings.
-    The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use.
-
-    Key differences from LLM SFT (`_sft_common`):
-    - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2)
-    - Uses NullTokenizer (VLMs use processor instead of tokenizer)
-    - DDP config optimized for VLM training (no grad/param overlap)
-    - Supports freeze options for language_model, vision_model, vision_projection
-    - Different training defaults (train_iters=300000, GBS=32, MBS=2)
-    - Different RNG seed (1234)
-
-    Returns:
-        ConfigContainer: Base configuration template for VLM full SFT.
-    """
-    # Start from the LLM SFT common config
-    cfg = _sft_common()
-
-    # Default output directories
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for VLM
-    seq_length = 4096
-
-    # VLM-specific training config - longer training with different batch sizes
-    cfg.train.train_iters = 300000
-    cfg.train.global_batch_size = 32
-    cfg.train.micro_batch_size = 2
-    cfg.train.manual_gc = True
-    cfg.train.manual_gc_interval = 100
-    cfg.train.manual_gc_eval = 100
-
-    # VLM-specific validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
-    # VLM-specific optimizer settings - higher LR for VLM training
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=3e-4,
-        min_lr=3e-5,
-    )
-    cfg.optimizer = opt_cfg
-    cfg.scheduler = scheduler_cfg
-
-    # VLM-specific DDP config - no overlap for VLMs
-    cfg.ddp = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-        overlap_grad_reduce=False,
-        overlap_param_gather=False,
-        average_in_collective=True,
-        data_parallel_sharding_strategy="optim_grads_params",
-        use_distributed_optimizer=True,
-    )
-
-    # VLM-specific dataset - uses HuggingFace dataset provider
-    # hf_processor_path must be set by model-specific config
-    cfg.dataset = HFDatasetConversationProvider(
-        seq_length=seq_length,
-        hf_processor_path=None,  # Must be set by model-specific config
-        maker_name="make_cord_v2_dataset",
-        num_workers=2,
-        dataloader_type="single",
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        pack_sequences_in_batch=True,
-    )
-
-    # VLM uses NullTokenizer - actual tokenization is handled by the processor
-    cfg.tokenizer = TokenizerConfig(
-        tokenizer_type="NullTokenizer",
-        vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
-    )
-
-    # VLM-specific logger config
-    cfg.logger = LoggerConfig(
-        log_interval=10,
-        tensorboard_dir=tensorboard_dir,
-        log_timers_to_tensorboard=True,
-    )
-
-    # VLM-specific checkpoint config
-    cfg.checkpoint.save_interval = 500
-    cfg.checkpoint.save = checkpoint_dir
-    cfg.checkpoint.load = checkpoint_dir
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.fully_parallel_save = True
-
-    # VLM uses different RNG seed
-    cfg.rng = RNGConfig(seed=1234)
-
-    return cfg
-
-
-def _peft_common_vlm() -> ConfigContainer:
-    """Create a base PEFT ConfigContainer with LoRA defaults for Vision-Language Models.
-
-    This function inherits from `_peft_common()` and overrides VLM-specific settings.
-    The caller MUST set `cfg.model` and `cfg.dataset.hf_processor_path` before use.
-
-    Key differences from LLM PEFT (`_peft_common`):
-    - Uses HFDatasetConversationProvider with HuggingFace datasets (e.g., CORD-v2)
-    - Uses NullTokenizer (VLMs use processor instead of tokenizer)
-    - DDP config optimized for VLM training (no grad/param overlap)
-    - Supports freeze options for language_model, vision_model, vision_projection
-    - Different training defaults (train_iters=300000, GBS=32, MBS=2)
-    - Different RNG seed (1234)
-    - Higher LR (1e-4) for adapter training
-
-    Returns:
-        ConfigContainer: Base configuration template for VLM PEFT with LoRA.
-    """
-    # Start from the LLM PEFT common config
-    cfg = _peft_common()
-
-    # Default output directories
-    base_output_dir = os.path.join(os.getcwd(), "nemo_experiments")
-    run_output_dir = os.path.join(base_output_dir, "default")
-    checkpoint_dir = os.path.join(run_output_dir, "checkpoints")
-    tensorboard_dir = os.path.join(run_output_dir, "tb_logs")
-
-    # Default sequence length for VLM
-    seq_length = 4096
-
-    # VLM-specific training config - longer training with different batch sizes
-    cfg.train.train_iters = 300000
-    cfg.train.global_batch_size = 32
-    cfg.train.micro_batch_size = 2
-    cfg.train.manual_gc = True
-    cfg.train.manual_gc_interval = 100
-    cfg.train.manual_gc_eval = 100
-
-    # VLM-specific validation config
-    cfg.validation.eval_interval = 500
-    cfg.validation.eval_iters = 32
-
-    # VLM-specific optimizer settings - higher LR for PEFT
-    opt_cfg, scheduler_cfg = distributed_fused_adam_with_cosine_annealing(
-        lr_warmup_iters=500,
-        lr_decay_iters=None,  # Defaults to train_iters during validation
-        max_lr=1e-4,  # Higher LR for adapter training
-        min_lr=1e-5,
-    )
-    cfg.optimizer = opt_cfg
-    cfg.scheduler = scheduler_cfg
-
-    # VLM-specific DDP config - no overlap for VLMs
-    cfg.ddp = DistributedDataParallelConfig(
-        check_for_nan_in_grad=True,
-        grad_reduce_in_fp32=True,
-        overlap_grad_reduce=False,
-        overlap_param_gather=False,
-        average_in_collective=True,
-        data_parallel_sharding_strategy="optim_grads_params",
-        use_distributed_optimizer=True,
-    )
-
-    # VLM-specific dataset - uses HuggingFace dataset provider
-    # hf_processor_path must be set by model-specific config
-    cfg.dataset = HFDatasetConversationProvider(
-        seq_length=seq_length,
-        hf_processor_path=None,  # Must be set by model-specific config
-        maker_name="make_cord_v2_dataset",
-        num_workers=2,
-        dataloader_type="single",
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        pack_sequences_in_batch=True,
-    )
-
-    # VLM uses NullTokenizer - actual tokenization is handled by the processor
-    cfg.tokenizer = TokenizerConfig(
-        tokenizer_type="NullTokenizer",
-        vocab_size=DEFAULT_NULL_TOKENIZER_VOCAB_SIZE,
-    )
-
-    # VLM-specific logger config
-    cfg.logger = LoggerConfig(
-        log_interval=10,
-        tensorboard_dir=tensorboard_dir,
-        log_timers_to_tensorboard=True,
-    )
-
-    # VLM-specific checkpoint config
-    cfg.checkpoint.save_interval = 500
-    cfg.checkpoint.save = checkpoint_dir
-    cfg.checkpoint.load = checkpoint_dir
-    cfg.checkpoint.ckpt_format = "torch_dist"
-    cfg.checkpoint.fully_parallel_save = True
-
-    # VLM uses different RNG seed
-    cfg.rng = RNGConfig(seed=1234)
-
-    # Keep LoRA config from _peft_common() - it's already set with standard defaults
-
-    return cfg
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/finetune_utils.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Utility functions for finetuning recipes."""
-
-from megatron.bridge.data.builders.hf_dataset import HFDatasetConfig
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-from megatron.bridge.data.hf_processors.gsm8k import process_gsm8k_example
-from megatron.bridge.data.hf_processors.openmathinstruct2 import process_openmathinstruct2_example
-from megatron.bridge.data.hf_processors.squad import process_squad_example
-from megatron.bridge.peft.base import PEFT
-from megatron.bridge.peft.dora import DoRA
-from megatron.bridge.peft.lora import LoRA
-
-
-def default_peft_config(peft_scheme: str | PEFT | None, **kwargs) -> PEFT | None:
-    """Create default PEFT configuration matching NeMo2 exactly.
-
-    Args:
-        peft_scheme: PEFT scheme - 'lora', 'dora', PEFT instance, or None for full finetuning
-
-    Returns:
-        PEFT configuration or None for full finetuning
-    """
-    if peft_scheme is None:
-        return None  # Full finetuning
-
-    if isinstance(peft_scheme, PEFT):
-        return peft_scheme  # User provided custom PEFT
-
-    if isinstance(peft_scheme, str):
-        if peft_scheme.lower() == "none":
-            return None
-        if peft_scheme.lower() == "lora":
-            return LoRA(**kwargs)
-        elif peft_scheme.lower() == "dora":
-            return DoRA(**kwargs)
-        else:
-            raise ValueError(f"Unknown PEFT scheme: {peft_scheme}. Supported: 'lora', 'dora', or None")
-
-    raise ValueError(f"Invalid peft type: {type(peft_scheme)}. Expected str, PEFT instance, or None")
-
-
-def default_squad_config(seq_length: int, packed_sequence: bool = True, pad_seq_to_mult: int = 1) -> HFDatasetConfig:
-    """Create default SQuAD dataset configuration for finetuning recipes.
-
-    Args:
-        seq_length: Sequence length for the dataset
-        packed_sequence: Whether to enable packed sequences for training efficiency
-        pad_seq_to_mult: Optional multiple to pad each sequence to when packing
-            (set to `2 * context_parallel_size` for THD CP runs).
-
-    Returns:
-        HFDatasetConfig configured for SQuAD finetuning
-
-    Note:
-        Uses consistent settings across all finetuning recipes:
-        - SQuAD dataset with appropriate dataloader type
-        - 10% validation split
-        - Seed 5678 (different from pretrain seed 1234)
-        - Packed sequences when enabled improve training efficiency
-    """
-    if packed_sequence:
-        # Packed sequence configuration
-        dataset_kwargs = {"pad_to_max_length": True}
-        packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult)
-    else:
-        # Standard configuration
-        dataset_kwargs = {}
-        packed_sequence_specs = None
-
-    # Use 'batch' sampler for variable-length finetuning
-    # Samples full global batch to ensure consistent padding across all microbatches
-    dataloader_type = "batch"
-
-    return HFDatasetConfig(
-        dataset_name="squad",
-        process_example_fn=process_squad_example,
-        seq_length=seq_length,
-        seed=5678,  # Different from pretrain seed
-        dataloader_type=dataloader_type,
-        num_workers=1,
-        do_validation=True,
-        do_test=False,
-        val_proportion=0.1,
-        dataset_kwargs=dataset_kwargs,
-        packed_sequence_specs=packed_sequence_specs,
-        rewrite=False,
-    )
-
-
-def default_openmathinstruct2_config(
-    seq_length: int = 4096,
-    packed_sequence: bool = False,
-    pad_seq_to_mult: int = 1,
-) -> HFDatasetConfig:
-    """Create default OpenMathInstruct-2 dataset configuration for finetuning recipes."""
-    # Create packed sequence specs if needed
-    packed_sequence_specs = None
-    if packed_sequence:
-        packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult)
-
-    return HFDatasetConfig(
-        dataset_name="nvidia/OpenMathInstruct-2",  # Hugging Face dataset name
-        split="train_1M",  # Default to the 1M subset
-        process_example_fn=process_openmathinstruct2_example,  # Processing function
-        seq_length=seq_length,
-        seed=5678,
-        memmap_workers=1,
-        # Dataloader config parameters
-        dataloader_type="batch",
-        do_validation=True,
-        do_test=False,
-        val_proportion=0.05,  # 950k train, 50k val
-        num_workers=2,
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        packed_sequence_specs=packed_sequence_specs,
-        rewrite=False,  # Rewrite existing processed files
-    )
-
-
-def default_gsm8k_config(
-    seq_length: int = 2048,
-    packed_sequence: bool = False,
-    pad_seq_to_mult: int = 1,
-) -> HFDatasetConfig:
-    """Create default GSM8K dataset configuration for finetuning recipes.
-
-    GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse
-    grade school math word problems. See: https://huggingface.co/datasets/openai/gsm8k
-
-    Args:
-        seq_length: Sequence length for the dataset (default 2048, sufficient for GSM8K)
-        packed_sequence: Whether to enable packed sequences for training efficiency
-        pad_seq_to_mult: Optional multiple to pad each sequence to when packing
-            (set to `2 * context_parallel_size` for THD CP runs).
-
-    Returns:
-        HFDatasetConfig configured for GSM8K finetuning
-
-    Note:
-        - GSM8K has 7,473 train and 1,319 test examples
-        - Loads the full DatasetDict so the published test split is used for evaluation
-        - Uses 'batch' dataloader type for variable-length finetuning
-    """
-    # Create packed sequence specs if needed
-    packed_sequence_specs = None
-    if packed_sequence:
-        packed_sequence_specs = PackedSequenceSpecs(packed_sequence_size=seq_length, pad_seq_to_mult=pad_seq_to_mult)
-
-    return HFDatasetConfig(
-        dataset_name="openai/gsm8k",  # Hugging Face dataset name
-        dataset_subset="main",  # 'main' or 'socratic'
-        process_example_fn=process_gsm8k_example,  # Processing function
-        seq_length=seq_length,
-        seed=5678,
-        memmap_workers=1,
-        # Dataloader config parameters
-        dataloader_type="batch",
-        do_validation=False,
-        do_test=True,
-        num_workers=2,
-        data_sharding=True,
-        pin_memory=True,
-        persistent_workers=False,
-        packed_sequence_specs=packed_sequence_specs,
-        rewrite=False,
-    )
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/recipes/utils/dataset_utils.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Dataset configuration utilities for recipes and training scripts."""
-
-import logging
-from typing import Callable, List, Optional, Tuple
-
-from megatron.bridge.data.energon.energon_provider import EnergonProvider
-from megatron.bridge.data.loaders import get_blend_and_blend_per_split
-from megatron.bridge.data.vlm_datasets.hf_provider import HFDatasetConversationProvider
-from megatron.bridge.data.vlm_datasets.preloaded_provider import PreloadedVLMConversationProvider
-from megatron.bridge.recipes.utils.finetune_utils import (
-    default_gsm8k_config,
-    default_openmathinstruct2_config,
-    default_squad_config,
-)
-from megatron.bridge.training.config import (
-    ConfigContainer,
-    FinetuningDatasetConfig,
-    GPTDatasetConfig,
-    MockGPTDatasetConfig,
-)
-
-
-logger = logging.getLogger(__name__)
-
-
-_BLEND_TYPE = Optional[Tuple[List[str], Optional[List[float]]]]
-_BLEND_PER_SPLIT_TYPE = Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]
-_SPLIT_TYPE = Optional[str]
-
-
-def get_blend_fields_from_data_paths(
-    data_paths: Optional[List[str]] = None,
-    data_args_path: Optional[str] = None,
-    train_data_path: Optional[List[str]] = None,
-    valid_data_path: Optional[List[str]] = None,
-    test_data_path: Optional[List[str]] = None,
-    per_split_data_args_path: Optional[str] = None,
-    mock: bool = False,
-) -> Tuple[_BLEND_TYPE, _BLEND_PER_SPLIT_TYPE, _SPLIT_TYPE]:
-    """
-    Common configuration logic for blend, blend_per_split, split dataset config fields.
-
-    Handles mock and real data. If no path to data is provided, mock data will be used.
-    Prioritizes `data_paths` over split data paths. For all of `data_paths`, `train_data_path`,
-    `valid_data_path`, and `test_data_path`, two formats are accepted: either (1) a list of prefixes,
-    e.g. ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], or (2) a flattened, zipped
-    list of weights and prefixes, e.g. ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"]
-
-    Args:
-        data_paths (Optional[List[str]]): List of paths to dataset files.
-        data_args_path (Optional[str]): Path to file containing data arguments.
-        train_data_path (Optional[List[str]]): List of training data paths.
-        valid_data_path (Optional[List[str]]): List of validation data paths.
-        test_data_path (Optional[List[str]]): List of test data paths.
-        per_split_data_args_path (Optional[str]): Path to JSON file with per-split data configuration.
-        mock (bool): Whether to use mock data. If True, ignores data_paths.
-
-    Returns:
-        A tuple (blend, blend_per_split, split), the corresponding fields to be passed to GPTDatasetConfig.
-    """
-    has_any_data_config = any(
-        [data_paths, data_args_path, train_data_path, valid_data_path, test_data_path, per_split_data_args_path]
-    )
-
-    if mock or not has_any_data_config:
-        # Mock data configuration
-        blend = None  # Will trigger mock mode automatically
-        blend_per_split = None  # Will trigger mock mode automatically
-        split = "1,1,1"  # Equal splits for testing
-    else:
-        # Real data configuration
-        blend, blend_per_split = get_blend_and_blend_per_split(
-            data_paths=data_paths,
-            data_args_path=data_args_path,
-            train_data_paths=train_data_path,
-            valid_data_paths=valid_data_path,
-            test_data_paths=test_data_path,
-            per_split_data_args_path=per_split_data_args_path,
-        )
-
-        if blend_per_split is not None:
-            # When using blend_per_split, split should be None
-            split = None
-        elif blend is not None:
-            # When using regular blend, we can use split
-            split = "9999,8,2"
-        else:
-            # No data provided, fall back to mock mode
-            split = "1,1,1"
-
-    return blend, blend_per_split, split
-
-
-# ---------------------------------------------------------------------------
-# Unified dataset type registry
-# ---------------------------------------------------------------------------
-
-DATASET_TYPES = [
-    "llm-pretrain",
-    "llm-pretrain-mock",
-    "llm-finetune",
-    "llm-finetune-preloaded",
-    "vlm-energon",
-    "vlm-hf",
-    "vlm-preloaded",
-]
-
-LLM_FINETUNE_PRESETS: dict[str, Callable] = {
-    "squad": default_squad_config,
-    "openmathinstruct2": default_openmathinstruct2_config,
-    "gsm8k": default_gsm8k_config,
-}
-
-
-def extract_and_remove_override(cli_overrides: list[str], key: str, default: str | None = None) -> str | None:
-    """Extract a Hydra-style override (key=value) from *cli_overrides* and remove it.
-
-    Returns the value if found, otherwise *default*.
-    """
-    prefix = f"{key}="
-    for i, override in enumerate(cli_overrides):
-        if override.startswith(prefix):
-            value = override[len(prefix) :]
-            cli_overrides.pop(i)
-            return value
-    return default
-
-
-def _resolve_seq_length(config: ConfigContainer, seq_length: int | None) -> int:
-    """Resolve sequence length: explicit arg > model config > 4096 fallback."""
-    if seq_length is not None:
-        return seq_length
-    if hasattr(config, "model") and config.model is not None and hasattr(config.model, "seq_length"):
-        return config.model.seq_length
-    return 4096
-
-
-def apply_dataset_override(
-    config: ConfigContainer,
-    dataset_type: str,
-    packed_sequence: bool = False,
-    seq_length: int | None = None,
-    cli_overrides: list[str] | None = None,
-) -> ConfigContainer:
-    """Replace the recipe's dataset config based on the requested dataset type.
-
-    Args:
-        config: The recipe config to modify.
-        dataset_type: One of :data:`DATASET_TYPES`.
-        packed_sequence: Whether to enable packed sequences.
-        seq_length: Explicit sequence length (None = use model's or default 4096).
-        cli_overrides: Mutable list of Hydra-style CLI overrides. For ``llm-finetune``,
-            ``dataset.dataset_name`` is extracted and consumed here to select the preset.
-
-    Returns:
-        The modified ConfigContainer.
-    """
-    resolved_seq_length = _resolve_seq_length(config, seq_length)
-    if cli_overrides is None:
-        cli_overrides = []
-
-    if dataset_type == "llm-pretrain":
-        config.dataset = GPTDatasetConfig(
-            seq_length=resolved_seq_length,
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            num_dataset_builder_threads=1,
-            blend=None,
-            blend_per_split=None,
-            split="9999,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        )
-
-    elif dataset_type == "llm-pretrain-mock":
-        config.dataset = MockGPTDatasetConfig(
-            seq_length=resolved_seq_length,
-            random_seed=1234,
-            reset_attention_mask=False,
-            reset_position_ids=False,
-            eod_mask_loss=False,
-            num_dataset_builder_threads=1,
-            split="9999,8,2",
-            data_sharding=True,
-            dataloader_type="single",
-            skip_getting_attention_mask_from_dataset=True,
-        )
-
-    elif dataset_type == "llm-finetune":
-        preset_name = extract_and_remove_override(cli_overrides, "dataset.dataset_name", default="squad")
-        if preset_name not in LLM_FINETUNE_PRESETS:
-            raise ValueError(
-                f"Unknown finetune dataset preset: '{preset_name}'. "
-                f"Choose from: {', '.join(sorted(LLM_FINETUNE_PRESETS.keys()))}"
-            )
-        factory = LLM_FINETUNE_PRESETS[preset_name]
-        kwargs: dict = {"packed_sequence": packed_sequence, "pad_seq_to_mult": 1}
-        kwargs["seq_length"] = resolved_seq_length
-        config.dataset = factory(**kwargs)
-
-    elif dataset_type == "llm-finetune-preloaded":
-        config.dataset = FinetuningDatasetConfig(
-            seq_length=resolved_seq_length,
-            dataset_root=None,
-            dataloader_type="batch",
-            seed=5678,
-        )
-
-    elif dataset_type == "vlm-energon":
-        if isinstance(config.dataset, EnergonProvider):
-            logger.info("Recipe already provides EnergonProvider; keeping it (preserves task_encoder).")
-        else:
-            logger.warning(
-                "Creating bare EnergonProvider. task_encoder and image_processor are unset; "
-                "use a recipe that provides them or set via code."
-            )
-            config.dataset = EnergonProvider(
-                path="",
-                seq_length=resolved_seq_length,
-                micro_batch_size=config.train.micro_batch_size,
-                global_batch_size=config.train.global_batch_size,
-                num_workers=2,
-            )
-
-    elif dataset_type == "vlm-hf":
-        config.dataset = HFDatasetConversationProvider(
-            seq_length=resolved_seq_length,
-            hf_processor_path=None,
-            maker_name="make_cord_v2_dataset",
-            num_workers=2,
-            dataloader_type="single",
-            data_sharding=True,
-            pin_memory=True,
-            persistent_workers=False,
-            pack_sequences_in_batch=False,
-        )
-
-    elif dataset_type == "vlm-preloaded":
-        config.dataset = PreloadedVLMConversationProvider(
-            seq_length=resolved_seq_length,
-            hf_processor_path=None,
-            train_data_path=None,
-            valid_data_path=None,
-            test_data_path=None,
-            dataloader_type="single",
-            num_workers=2,
-        )
-
-    else:
-        raise ValueError(f"Unknown dataset type: '{dataset_type}'. Choose from: {', '.join(DATASET_TYPES)}")
-
-    if seq_length is not None and hasattr(config, "model") and config.model is not None:
-        config.model.seq_length = seq_length
-
-    return config
-
-
-def infer_mode_from_dataset(dataset_type: str) -> str:
-    """Infer training mode from the dataset type prefix."""
-    if dataset_type.startswith("llm-pretrain"):
-        return "pretrain"
-    return "finetune"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/datasets/sft.py
-```py
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-import math
-import os
-import re
-from pathlib import Path
-from typing import Mapping
-
-import datasets
-import numpy as np
-import torch
-from datasets import load_dataset
-from megatron.core.msc_utils import MultiStorageClientFeature
-from torch.utils.data import Dataset
-
-from megatron.bridge.data.datasets.utils import (
-    _chat_preprocess,
-    _get_samples_mapping,
-    _JSONLMemMapDataset,
-    _OnlineSampleMapping,
-    _preprocess,
-    _tokenize,
-)
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-
-
-DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo"
-NEMO_CACHE_HOME = Path(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME))
-DEFAULT_NEMO_DATASETS_CACHE = NEMO_CACHE_HOME / "datasets"
-NEMO_DATASETS_CACHE = Path(os.getenv("NEMO_DATASETS_CACHE", DEFAULT_NEMO_DATASETS_CACHE))
-DEFAULT_NEMO_MODELS_CACHE = NEMO_CACHE_HOME / "models"
-NEMO_MODELS_CACHE = Path(os.getenv("NEMO_MODELS_CACHE", DEFAULT_NEMO_MODELS_CACHE))
-
-if os.getenv("TOKENIZERS_PARALLELISM") is None:
-    os.putenv("TOKENIZERS_PARALLELISM", "True")
-
-logger = logging.getLogger(__name__)
-
-# hack to avoid the "not enough disk space" error in some slurm cluster
-datasets.builder.has_sufficient_disk_space = lambda needed_bytes, directory=".": True
-
-PREFIX_STR = (
-    "\x00"  # the prefix string used in the tokenizer to deal with the added empty token for some of the tokenizers
-)
-
-__idx_version__ = "0.2"  # index file version
-__idx_suffix__ = "idx"  # index file suffix
-
-
-def get_dataset_root(name: str) -> Path:
-    """
-    Returns the root directory for NeMo datasets, creating it if it doesn't exist.
-
-    Args:
-        name (str): The name of the dataset, used to create a subdirectory within the NeMo datasets cache.
-
-    Returns:
-        Path: The path to the dataset's root directory.
-    """
-    output = Path(NEMO_DATASETS_CACHE) / name
-    output.mkdir(parents=True, exist_ok=True)
-
-    return output
-
-
-def create_sft_dataset(
-    path: str | Path,
-    tokenizer: "MegatronTokenizer",
-    seq_length: int = 2048,
-    add_bos: bool = False,
-    add_eos: bool = True,
-    add_sep: bool = False,
-    seed: int = 1234,
-    label_key: str = "output",
-    answer_only_loss: bool = True,
-    truncation_field: str = "input",
-    pad_to_max_length: bool = False,
-    index_mapping_dir: str | None = None,
-    prompt_template: str = "{input} {output}",
-    truncation_method: str = "right",
-    memmap_workers: int = 2,
-    hf_dataset: bool = False,
-    global_sample_mapping: bool = False,
-    get_attention_mask_from_fusion: bool = True,
-    pack_metadata_file_path: Path | str | None = None,
-    pad_cu_seqlens: bool = False,
-    pad_seq_to_mult: int = 1,
-    chat: bool = False,
-    use_hf_tokenizer_chat_template: bool = False,
-    tool_schemas: str | dict | None = None,
-    **kwargs,
-) -> "GPTSFTDataset":
-    """
-    Creates and returns a supervised fine-tuning (SFT) dataset instance.
-
-    This function acts as a factory for different types of SFT datasets based on the
-    input parameters. It can create standard SFT datasets, chat-specific datasets,
-    or packed sequence datasets.
-
-    Dataset selection logic:
-    1. If path ends with .npy: GPTSFTPackedDataset (legacy packed format)
-    2. If path is a packed parquet spec (file/dir/glob ending in .parquet/.pq,
-       or a directory): GPTSFTPackedParquetDataset
-       - Note: Selection is based on path pattern, not pack_metadata_file_path
-       - Schema validation (REQUIRED_COLUMNS) will fast-fail for non-packed files
-    3. If chat=True: GPTSFTChatDataset
-    4. Otherwise: GPTSFTDataset
-
-    Args:
-        path (str | Path): Path to the dataset file or packed parquet spec (file/dir/glob).
-            For packed datasets, this can be a .npy file, a .parquet file, a directory
-            containing parquet files, or a glob pattern.
-        tokenizer (MegatronTokenizer): The tokenizer to use for tokenizing the data.
-        seq_length (int, optional): Maximum sequence length for each example. Defaults to 2048.
-        add_bos (bool, optional): Whether to add a beginning-of-sentence token. Defaults to False.
-        add_eos (bool, optional): Whether to add an end-of-sentence token. Defaults to True.
-        add_sep (bool, optional): Whether to add a separation token between prompt and answer. Defaults to False.
-        seed (int, optional): Random seed for data shuffling. Defaults to 1234.
-        label_key (str, optional): The key in the dataset corresponding to the label/output. Defaults to "output".
-        answer_only_loss (bool, optional): If True, compute loss only on the answer part. Defaults to True.
-        truncation_field (str, optional): Field(s) to truncate if the combined length exceeds `seq_length`.
-            Comma-separated if multiple. Defaults to "input".
-        pad_to_max_length (bool, optional): Whether to pad all samples to `max_seq_length`. Defaults to False.
-        index_mapping_dir (str | None, optional): Directory to store/load index mapping files. Defaults to None.
-        prompt_template (str, optional): F-string template for combining input fields.
-            Example: "{input} {output}". Defaults to "{input} {output}".
-        truncation_method (str, optional): Method for truncation ('left' or 'right'). Defaults to "right".
-        memmap_workers (int, optional): Number of workers for memory-mapped dataset loading. Defaults to 2.
-        hf_dataset (bool, optional): Whether to load the dataset using HuggingFace's `datasets` library.
-            Defaults to False.
-        global_sample_mapping (bool, optional): Whether to use a global sample mapping for shuffling across all data,
-            or shuffle within each epoch. Defaults to False.
-        get_attention_mask_from_fusion (bool): if true, lets attention kernel handle creation of causal mask instead
-            of adding it to the batch dict.
-        pack_metadata_file_path (Path | str | None, optional): Path to the metadata file for packed datasets.
-            When provided, enables packed mode. Required if `pad_cu_seqlens` is True. Defaults to None.
-        pad_cu_seqlens (bool, optional): Whether to pad `cu_seqlens` for packed datasets,
-            required for cudagraphs. Defaults to False.
-        chat (bool, optional): If True, creates a `GPTSFTChatDataset`. Defaults to False.
-        use_hf_tokenizer_chat_template (bool, optional): If True, uses HuggingFace tokenizer's chat template
-            via `apply_chat_template` method. Only applies when `chat=True`. Defaults to False.
-        tool_schemas (str | dict | None, optional): Tool schemas for function calling support.
-            Can be a JSON string or a dict. Only applies when `chat=True` and
-            `use_hf_tokenizer_chat_template=True`. Defaults to None.
-        **kwargs: Additional keyword arguments passed to the specific dataset class constructor.
-
-    Returns:
-        GPTSFTDataset | GPTSFTChatDataset | GPTSFTPackedDataset: An instance of the appropriate SFT dataset class.
-    """
-    # Normalize path to string for consistent handling
-    path_str = str(path)
-
-    gpt_sft_dataset_kwargs = {
-        "file_path": path_str,
-        "tokenizer": tokenizer,
-        "max_seq_length": seq_length,
-        "memmap_workers": memmap_workers,
-        "hf_dataset": hf_dataset,
-        "global_sample_mapping": global_sample_mapping,
-        "add_bos": add_bos,
-        "add_eos": add_eos,
-        "add_sep": add_sep,
-        "seed": seed,
-        "label_key": label_key,
-        "answer_only_loss": answer_only_loss,
-        "truncation_field": truncation_field,
-        "pad_to_max_length": pad_to_max_length,
-        "index_mapping_dir": index_mapping_dir,
-        "prompt_template": prompt_template,
-        "truncation_method": truncation_method,
-        "get_attention_mask_from_fusion": get_attention_mask_from_fusion,
-    }
-
-    # Check for .npy packed dataset (legacy format)
-    if path_str.lower().endswith(".npy"):
-        return GPTSFTPackedDataset(
-            pack_metadata_file_path=pack_metadata_file_path,
-            pad_cu_seqlens=pad_cu_seqlens,
-            pad_seq_to_mult=pad_seq_to_mult,
-            **gpt_sft_dataset_kwargs,
-            **kwargs,
-        )
-
-    # Lazy import to avoid circular dependency (packed_parquet imports from sft)
-    from megatron.bridge.data.datasets.packed_parquet import (
-        GPTSFTPackedParquetDataset,
-        is_packed_parquet_spec,
-    )
-
-    # Select GPTSFTPackedParquetDataset for any packed parquet spec (file/dir/glob)
-    # This is determined purely by path pattern, NOT by pack_metadata_file_path.
-    # Rationale:
-    # - Directory/glob specs clearly indicate packed parquet shards
-    # - Schema validation (REQUIRED_COLUMNS) will fast-fail if files aren't packed format
-    # - This allows externally-prepared packed data to work without requiring MB metadata
-    if is_packed_parquet_spec(path_str):
-        return GPTSFTPackedParquetDataset(
-            pack_metadata_file_path=pack_metadata_file_path,
-            pad_cu_seqlens=pad_cu_seqlens,
-            **gpt_sft_dataset_kwargs,
-            **kwargs,
-        )
-    elif chat:
-        return GPTSFTChatDataset(
-            **gpt_sft_dataset_kwargs,
-            use_hf_tokenizer_chat_template=use_hf_tokenizer_chat_template,
-            tool_schemas=tool_schemas,
-            **kwargs,
-        )
-    else:
-        return GPTSFTDataset(
-            **gpt_sft_dataset_kwargs,
-            **kwargs,
-        )
-
-
-class GPTSFTDataset(Dataset):
-    """ """
-
-    def __init__(
-        self,
-        file_path: str,
-        tokenizer: MegatronTokenizer,
-        max_seq_length: int = 1024,
-        min_seq_length: int = 1,
-        pad_seq_length_to_mult: int = 16,
-        add_bos: bool = False,
-        add_eos: bool = True,
-        add_sep: bool = False,
-        sep_id: int = None,
-        max_num_samples: int = None,
-        seed: int = 1234,
-        label_key: str = "answer",
-        answer_only_loss: bool = True,
-        truncation_field: str = "text",
-        pad_to_max_length: bool = False,  # (@adithyare) allows for much faster training especially in PEFT settings.
-        index_mapping_dir: str = None,
-        prompt_template: str = None,
-        virtual_tokens: int = 0,
-        tokens_to_generate: int = 0,
-        memmap_workers: int | None = None,
-        hf_dataset: bool = False,
-        global_sample_mapping: bool = False,
-        truncation_method: str = "right",
-        special_tokens: Mapping[str, str] | None = None,  # special tokens, a dictory of {token_type: token}
-        is_test: bool = False,
-        output_original_text: bool = False,
-        ceil_to_power_2: bool = False,
-        get_attention_mask_from_fusion: bool = True,
-    ):
-        """
-        file_path: Path to a JSONL GPT supervised fine-tuning dataset.
-            Data is formatted as multiple JSON lines with each line formatted as follows:
-            {
-                'input': 'John von Neumann\nVon Neumann made fundamental contributions ...
-                    Q: What did the math of artificial viscosity do?',
-                'output': 'smoothed the shock transition without sacrificing basic physics'
-            }
-        tokenizer: Tokenizer for the dataset. Instance of a class that inherits MegatronTokenizer (ex: SentencePiece).
-        max_seq_length (int): maximum sequence length for each dataset examples.
-            Examples will either be truncated to fit this length or dropped if they cannot be truncated.
-        min_seq_length (int): min length of each data example in the dataset.
-            Data examples will be dropped if they do not meet the min length requirements.
-        add_bos (bool): Whether to add a beginning of sentence token to each data example
-        add_eos (bool): Whether to add an end of sentence token to each data example
-        add_sep (bool): Whether to add a separation token to each data example (goes between prompt and answer)
-        tokens_to_generate (int): (inference only) Number of tokens to generate during inference
-        seed: Random seed for data shuffling.
-        max_num_samples: Maximum number of samples to load.
-            This can be > dataset length if you want to oversample data. If None, all samples will be loaded.
-        label_key: Key to use for the label in your JSONL file
-        answer_only_loss: If True, will compute the loss only on the answer part of the input.
-            If False, will compute the loss on the entire input.
-        truncation_field: Field to use for truncation. (Options: keys in prompt_template).
-            Field to be used for truncation if the combined length exceeds the max sequence length.
-        pad_to_max_length: Whether to pad the input to the max sequence length.
-            If False, will pad to the max length of the current batch.
-        index_mapping_dir: Directory to save the index mapping to.
-            If None, will write to the same folder as the dataset.
-        prompt_template: Prompt template to inject via an fstring.
-            Formatted like Q: {context_key}\n\nA: {label_key}
-        hf_dataset: Whether to load the json file with the HuggingFace dataset.
-            Otherwise, will load the jsonl file with the JSONLMemMapDataset.
-        global_sample_mapping: Whether to shuffle all data together, or shuffle the dataset within each epoch
-        truncation_method: Truncation from which position. Options: ['left', 'right']
-        special_tokens: special tokens for the chat prompts, a dictionary of {token_type: token}.
-            Default: {
-                'system_turn_start': '<extra_id_0>',
-                'turn_start': '<extra_id_1>',
-                'label_start': '<extra_id_2>',
-                'end_of_turn': '\n',
-                'end_of_name': '\n'
-            }
-        is_test: Whether this dataset is the test split.
-        output_original_text (bool): if true, will keep the original text in the output alongside the tokenized ids.
-        get_attention_mask_from_fusion (bool): if true, lets attention kernel handle creation of causal mask instead
-            of adding it to the batch dict.
-        """
-        self.tokenizer = tokenizer
-        self.file_path = file_path
-        self.max_seq_length = max_seq_length
-        self.min_seq_length = min_seq_length
-        self.pad_seq_length_to_mult = pad_seq_length_to_mult
-        self.add_bos = add_bos
-        self.add_eos = add_eos
-        self.add_sep = add_sep
-        self.sep_id = sep_id
-        self.max_num_samples = max_num_samples
-        self.seed = seed
-        self.label_key = label_key
-        self.answer_only_loss = answer_only_loss
-        self.truncation_fields = truncation_field.split(",") if truncation_field is not None else []
-        self.pad_to_max_length = pad_to_max_length
-        self.index_mapping_dir = index_mapping_dir
-        self.prompt_template = prompt_template
-        self.virtual_tokens = virtual_tokens
-        self.tokens_to_generate = tokens_to_generate
-        self.memmap_workers = memmap_workers
-        self.hf_dataset = hf_dataset
-        self.global_sample_mapping = global_sample_mapping
-        self.truncation_method = truncation_method
-        self.is_test = is_test
-        self.output_original_text = output_original_text
-        self.ceil_to_power_2 = ceil_to_power_2
-        self.get_attention_mask_from_fusion = get_attention_mask_from_fusion
-
-        if special_tokens is None:
-            self.special_tokens = {
-                "system_turn_start": "<extra_id_0>",
-                "turn_start": "<extra_id_1>",
-                "label_start": "<extra_id_2>",
-                "end_of_turn": "\n",
-                "end_of_name": "\n",
-            }
-        else:
-            self.special_tokens = special_tokens
-
-        self._load_dataset()
-
-        # Validate prompt template
-        self._maybe_validate_prompt_template()
-
-        # Will be None after this call if `max_num_samples` is None
-        self._build_samples_mapping()
-
-    def _load_dataset(self):
-        if self.hf_dataset:
-            self.indexed_dataset = load_dataset(
-                "json",
-                data_files=self.file_path,
-                cache_dir=self.index_mapping_dir,
-                num_proc=self.memmap_workers,
-                split="train",
-            )
-        else:
-            self.indexed_dataset = _JSONLMemMapDataset(
-                dataset_paths=[self.file_path],
-                tokenizer=None,
-                header_lines=0,
-                index_mapping_dir=self.index_mapping_dir,
-                workers=self.memmap_workers,
-            )
-
-    def _maybe_validate_prompt_template(self):
-        assert self.prompt_template is not None, (
-            f"we need prompt_template to combine contexts and label {self.label_key}"
-        )
-        # When providing things like newlines in the prompt template via the CLI, they are escaped.
-        # This line unescapes them.
-        self.prompt_template = self.prompt_template.encode("utf-8").decode("unicode_escape")
-        self.prompt_template_keys = re.findall(r"{(.*?)}", self.prompt_template)
-
-        label_placeholder = f"{{{self.label_key}}}"
-        assert self.prompt_template[-len(label_placeholder) :] == label_placeholder, (
-            f"{label_placeholder} must be at the end of prompt_template."
-        )
-
-        # Legacy checkpoints has self.truncation_fields = ['context']
-        # and self.prompt_template_keys = ['input', 'output']
-        if len(self.truncation_fields) > 0:
-            if self.prompt_template_keys[0] == "input" and self.truncation_fields[0] == "context":
-                self.truncation_fields[0] = self.prompt_template_keys[0]
-
-        assert set(self.truncation_fields).issubset(self.prompt_template_keys), (
-            f"truncation_fields {self.truncation_fields} must in {self.prompt_template_keys}"
-        )
-
-    def _build_samples_mapping(self):
-        if self.max_num_samples is not None:
-            osm = (
-                _OnlineSampleMapping(dataset_size=len(self.indexed_dataset), num_samples=self.max_num_samples)
-                if not self.global_sample_mapping
-                else None
-            )
-            self.samples_mapping = _get_samples_mapping(
-                indexed_dataset=self.indexed_dataset,
-                data_prefix=self.file_path,
-                num_epochs=None,
-                max_num_samples=self.max_num_samples,
-                max_seq_length=self.max_seq_length - 2,
-                short_seq_prob=0,
-                seed=self.seed,
-                name=self.file_path.split("/")[-1],
-                binary_head=False,
-                index_mapping_dir=self.index_mapping_dir,
-                samples_mapping=osm,
-            )
-        else:
-            self.samples_mapping = None
-
-    def __len__(self):
-        """Return the total number of samples in this dataset."""
-        if self.max_num_samples is None:
-            return len(self.indexed_dataset)
-        else:
-            return len(self.samples_mapping)
-
-    def __getitem__(self, idx):
-        if isinstance(idx, np.int64):
-            idx = idx.item()
-
-        if self.samples_mapping is not None:
-            assert idx < len(self.samples_mapping)
-            idx, _, _ = self.samples_mapping[idx]
-            if isinstance(idx, (np.uint32, np.int64)):
-                idx = idx.item()
-
-        assert idx < len(self.indexed_dataset)
-        # idx may < 0 because we pad_samples_to_global_batch_size, e.g. id = -1
-        if idx < 0:
-            idx = len(self) + idx
-            auto_gen_idx = True
-        else:
-            auto_gen_idx = False
-        try:
-            example = self.indexed_dataset[idx]
-            if auto_gen_idx:
-                example["__AUTOGENERATED__"] = True
-        except Exception as e:
-            logger.error(f"Error while loading example {idx} from dataset {self.file_path}")
-            raise e
-        return self._process_example(example)
-
-    def _separate_template(self, prompt_template_values: list[str]):
-        """
-        Combine contexts and label based on prompt_template into a list of strings and a list of keys.
-
-        Args:
-            prompt_template_values (list[str]): the list of context and label strings
-                extrated from jsonl file with prompt_template_keys.
-
-        Returns:
-            template_strings (list[str]): separated prompt_template with contexts/label
-                placeholder filled with corresponding strings
-            template_strings_keys (list[str]): strings point to placeholder keys or <template>
-
-        Examples:
-            prompt_template = 'Context:  {context} Question: {question} Answer: {label}'
-            prompt_template_values = ['xxx', 'yyy', 'zzz']
-
-            # tokenizer.space_sensitive = True
-            template_strings = ['Context:', '  xxx', ' Question:', ' yyy', ' Answer:', ' zzz']
-
-            # tokenizer.space_sensitive = False
-            template_strings = ['Context:', ' xxx', 'Question:', 'yyy', 'Answer:', 'zzz']
-
-            template_strings_keys = ['<template>', 'context', '<template>', 'question', '<template>', 'label']
-        """
-        placeholders = [f"{{{k}}}" for k in self.prompt_template_keys]
-
-        # placeholder to string
-        ph_to_s = {ph: s for ph, s in zip(placeholders, prompt_template_values)}
-        # placeholder to key
-        ph_to_k = {ph: k for ph, k in zip(placeholders, self.prompt_template_keys)}
-
-        # separate prompt_template based on '<space>{placeholder}'
-        # examples:
-        #   self.prompt_template = "Context:{context}  Passage: {passage}\n\nQuestion:{question} {label}"
-        #   template_with_placeholder_separated = [
-        #       'Context:', '{context}', '  Passage:', ' {passage}', '\n\nQuestion:', '{question}', ' {label}'
-        #   ]
-        template_with_placeholder_separated = re.split("( *?{.+?})", self.prompt_template)
-        template_with_placeholder_separated = [s for s in template_with_placeholder_separated if len(s) > 0]
-
-        # remove space if we have leading space and tokenizer is not space_sensitive
-        # space_sensitive = True : tokenizer.text_to_tokens('A{num_spaces}B') = (
-        #   tokenizer.text_to_tokens('A') + tokenizer.text_to_tokens('{num_spaces}B'
-        # )
-        # space_sensitive = False: tokenizer.text_to_tokens('A{num_spaces}B') = (
-        # tokenizer.text_to_tokens('A') + tokenizer.text_to_tokens('{num_spaces-1}B'
-        # )
-        space_sensitive = getattr(self.tokenizer, "space_sensitive", False)
-        template_with_space_reduced = [
-            s[1:] if not space_sensitive and s[0] == " " else s for s in template_with_placeholder_separated
-        ]
-
-        # convert placeholder to the corresponding string (preserve left spaces) and key
-        template_strings, template_strings_keys = [], []
-        for t in template_with_space_reduced:
-            placeholder = t.lstrip(" ")
-            left_spaces = " " * (len(t) - len(placeholder))
-            template_strings.append(left_spaces + ph_to_s.get(placeholder, placeholder))
-            template_strings_keys.append(ph_to_k.get(placeholder, "<template>"))
-
-        return template_strings, template_strings_keys
-
-    def _multiple_truncation(self, template_ids: list[list[int]], template_ids_keys: list[str]):
-        """
-        Calculate total tokens and truncate multiple contexts in truncation_fields.
-
-        Args:
-            template_ids (list[list[int]]): the list of separate prompt_template ids.
-            template_ids_keys (list[str]): the list of placeholder keys or <template>
-                (used to check key in truncation_fields).
-
-        Returns:
-            context_ids (list[int]): all context ids.
-            label_ids (list[int]): all label ids.
-        """
-        context_ids = template_ids[:-1]
-        label_ids = template_ids[-1]
-        total_ids = (
-            self.virtual_tokens
-            + sum(len(ids) for ids in context_ids)
-            + max(len(label_ids), self.tokens_to_generate)
-            + self.add_bos
-            + self.add_sep
-            + self.add_eos  # Only training need to consider eos token
-        )
-
-        if total_ids > self.max_seq_length:
-            truncation_length_total = total_ids - self.max_seq_length
-            num_fields = len(self.truncation_fields)
-            if num_fields > 0:
-                # sorted equal divide length to each field
-                # examples:
-                #   truncation_length_total = 3
-                #   num_fields = 11
-                #   truncation_length_list = [3,4,4]
-                truncation_length_list = [
-                    truncation_length_total // num_fields + (1 if i < truncation_length_total % num_fields else 0)
-                    for i in range(num_fields)[::-1]
-                ]
-
-                for i, (ids, key) in enumerate(zip(template_ids, template_ids_keys)):
-                    if key in self.truncation_fields:
-                        truncation_length = truncation_length_list.pop()
-                        if len(ids) < truncation_length:
-                            logger.warning(f"{key} is not long enough to truncate.")
-                            truncation_length = len(ids)
-
-                        truncation_length_total -= truncation_length
-                        template_ids[i] = self._truncation(ids, len(ids) - truncation_length)
-
-            if truncation_length_total > 0:
-                template_ids_lengths = [len(ids) for ids in template_ids]
-                if self.truncation_method == "left":
-                    iters = range(0, len(template_ids_lengths), 1)
-                elif self.truncation_method == "right":
-                    iters = range(len(template_ids_lengths) - 1, -1, -1)
-                    # We need to truncate more to let context_ids + tokens_to_generate < self.max_seq_length
-                    truncation_length_total += min(len(label_ids), self.tokens_to_generate)
-                else:
-                    raise ValueError(f"{self.truncation_method} is not supported")
-
-                # Iterate all lengths of template_ids.
-                for i in iters:
-                    if template_ids_lengths[i] >= truncation_length_total:
-                        template_ids_lengths[i] -= truncation_length_total
-                        template_ids[i] = self._truncation(template_ids[i], template_ids_lengths[i])
-                        break
-                    else:
-                        truncation_length_total -= template_ids_lengths[i]
-                        template_ids_lengths[i] = 0
-                        template_ids[i] = self._truncation(template_ids[i], template_ids_lengths[i])
-
-        context_ids = [i for ids in template_ids[:-1] for i in ids]
-        label_ids = template_ids[-1]
-        return context_ids, label_ids
-
-    def _truncation(self, ids, expect_length):
-        if expect_length == 0:
-            return []
-        elif self.truncation_method == "left":
-            return ids[-expect_length:]
-        elif self.truncation_method == "right":
-            return ids[:expect_length]
-        else:
-            raise ValueError(f"{self.truncation_method} is not supported")
-
-    def _process_example(self, example):
-        """
-        Create an example by concatenating text and answer.
-        Truncation is carried out when needed, but it is performed only on the prompt side.
-        BOS, EOS, and SEP, are added if specified.
-        """
-        prompt_template_values = []
-        for c in self.prompt_template_keys:
-            try:
-                prompt_template_values.append(example[c].strip(" "))
-            except KeyError as e:
-                if c == self.label_key and self.is_test:
-                    # allow missing label during testing,
-                    # if user only wants to do inference without calculating metrics
-                    prompt_template_values.append("")
-                else:
-                    raise e
-
-        template_strings, template_strings_keys = self._separate_template(prompt_template_values)
-        template_ids = [_tokenize(self.tokenizer, s) for s in template_strings]
-        context_ids, answer_ids = self._multiple_truncation(template_ids, template_strings_keys)
-
-        if self.virtual_tokens:
-            # (@adithyare) we are going to insert "pad/eos" tokens in the beginning of the text and context
-            # these pad/eos tokens are placeholders for virtual tokens
-            context_ids = [self.tokenizer.eos_id] * self.virtual_tokens + context_ids
-
-        # Adds bos token in the start
-        if self.add_bos:
-            context_ids = [self.tokenizer.bos_id] + context_ids
-
-        # Adds sep token between text/prompt and answer
-        if self.add_sep:
-            context_ids = context_ids + [self.sep_id]
-
-        input_ids = context_ids + answer_ids
-
-        # Only training need to consider eos token
-        if self.add_eos:
-            input_ids = input_ids + [self.tokenizer.eos_id]
-
-        # store metadata in dataset, in case user may have keys required in the prediction json files
-        metadata = {k: v for k, v in example.items() if k not in self.prompt_template_keys}
-        if self.output_original_text:
-            for orig_text, text_key in zip(template_strings, template_strings_keys):
-                metadata[text_key] = orig_text
-
-        processed_example = {
-            "input_ids": input_ids,
-            "answer_start_idx": len(context_ids),
-            "context_ids": context_ids,
-            "context_length": len(context_ids),
-            "answer_ids": answer_ids,
-            "metadata": metadata,
-            "token_count": len(input_ids),
-        }
-
-        return processed_example
-
-    def _maybe_cast_to_list(self, x):
-        if isinstance(x, np.ndarray):
-            return [item.tolist() for item in x]
-        return x
-
-    def _ceil_to_nearest(self, n, m):
-        if self.ceil_to_power_2:
-            # Reccurent Gemma (AKA Griffin) requires seq length to be a power of 2 for parallel scan
-            return 2 ** math.ceil(math.log2(n))
-        else:
-            return (n + m - 1) // m * m
-
-    def _collate_item(self, item, max_length, pad_id):
-        item = self._maybe_cast_to_list(item)
-        # max_length = max([len(x) for x in item]) if item else 0
-        # here [0] should be tokenizer.pad_id
-        item = [x + [pad_id] * (max_length - len(x)) for x in item]
-        return item
-
-    def _build_loss_mask(self, processed_example):
-        """Pad input_ids in batch to max batch length while building loss mask"""
-        input_ids = processed_example["input_ids"]
-        answer_start_idx = processed_example["answer_start_idx"]
-        if self.answer_only_loss:
-            loss_mask = [float(idx >= answer_start_idx) for idx in range(len(input_ids))]
-        else:
-            loss_mask = [1.0] * len(input_ids)
-
-        return loss_mask
-
-    @torch.no_grad()
-    def _create_attention_mask(self, max_length):
-        """Creates an upper-triangular causal attention mask.
-        Args:
-            input_ids: A 1D tensor that holds the indices of tokens.
-        """
-        # seq_length = len(input_ids)
-        # `attention_mask` has the shape of [1, seq_length, seq_length]
-        attention_mask = torch.tril(torch.ones((max_length, max_length))).unsqueeze(0)
-        attention_mask = attention_mask < 0.5
-        return attention_mask
-
-    def collate_fn(self, batch):
-        """
-        Collate a list of samples into a batch dictionary for model training or evaluation.
-
-        This function takes a list of individual processed samples (from `__getitem__`)
-        and groups them into a batch. It handles padding of sequences to the maximum
-        length found in the batch (or `self.max_seq_length` if `pad_to_max_length` is True),
-        and prepares all necessary tensors for the model.
-
-        Args:
-            batch (List[dict]): A list of dictionaries, where each dictionary is a
-                                sample processed by `_process_example`.
-
-        Returns:
-            dict: A dictionary of batched tensors ready for model input. Key tensors include
-                  'tokens', 'labels', 'loss_mask', 'position_ids', and 'attention_mask'.
-        """
-        input_ids = [item["input_ids"][:-1] for item in batch]
-        labels = [item["input_ids"][1:] for item in batch]
-        contexts = [item["context_ids"] for item in batch]
-        context_lengths = torch.LongTensor([item["context_length"] for item in batch])
-        answers = [item["answer_ids"] for item in batch]
-        loss_mask = [self._build_loss_mask(item)[1:] for item in batch]
-        metadata = [item["metadata"] for item in batch]
-        token_count = [item["token_count"] for item in batch]
-
-        max_length = max(max([len(x) for x in input_ids]), max([len(x) for x in contexts]) + self.tokens_to_generate)
-        # increase max length to nearest multiple of 4 or 8
-        if self.pad_to_max_length:
-            max_length = self.max_seq_length
-        else:
-            max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, self.pad_seq_length_to_mult))
-        assert max_length <= self.max_seq_length
-
-        if not self.get_attention_mask_from_fusion:
-            attention_mask = [self._create_attention_mask(max_length) for _ in batch]
-            attention_mask = torch.stack(attention_mask)
-        else:
-            attention_mask = None
-        position_ids = [list(range(max_length)) for _ in batch]
-        position_ids = torch.LongTensor(position_ids)
-        input_ids = torch.LongTensor(
-            self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id)
-        )
-        labels = torch.LongTensor(self._collate_item(labels, max_length=max_length, pad_id=self.tokenizer.eos_id))
-        loss_mask = torch.LongTensor(self._collate_item(loss_mask, max_length=max_length, pad_id=0))
-        contexts = torch.LongTensor(self._collate_item(contexts, max_length=max_length, pad_id=self.tokenizer.eos_id))
-        answers = torch.LongTensor(self._collate_item(answers, max_length=max_length, pad_id=self.tokenizer.eos_id))
-
-        processed_batch = {
-            "tokens": input_ids,
-            "labels": labels,
-            "loss_mask": loss_mask,
-            "position_ids": position_ids,
-            "contexts": contexts,
-            "context_lengths": context_lengths,
-            "answers": answers,
-            "metadata": metadata,
-            "token_count": token_count,
-            "attention_mask": attention_mask,
-        }
-
-        return processed_batch
-
-
-class GPTSFTPackedDataset(GPTSFTDataset):
-    """ """
-
-    def __init__(
-        self,
-        file_path: str,
-        tokenizer: MegatronTokenizer,
-        return_cu_seqlen: bool = True,
-        pad_cu_seqlens: bool = False,
-        pad_seq_to_mult: int = 1,
-        pack_metadata_file_path: str | None = None,
-        **kwargs,
-    ):
-        """
-        file_path: See `file_path` in the parent class.
-        tokenizer: See `tokenizer` in the parent class.
-        return_cu_seqlen: Whether to return `cu_seqlen` to pass to the model. Having `cu_seqlen` in the model input
-                enables THD attention kernel, which is the correct format for training with packed sequence to prevent
-                cross-sequence attention. This flag should be True unless you have a specific use case.
-        pad_seq_to_mult: The multiple used for padding sequences during packing. When > 1, cu_seqlens_unpadded
-                will be computed to support THD CP. When == 1 (no padding), cu_seqlens_unpadded is not computed.
-        """
-        np.random.seed(kwargs.get("seed", 1234))
-        super().__init__(file_path, tokenizer, **kwargs)
-        assert self.virtual_tokens == 0, "P-Tuning with packed sequence is not supported."
-        self.return_cu_seqlen = return_cu_seqlen
-        self._pad_seq_to_mult = pad_seq_to_mult
-
-        self.pad_cu_seqlens = pad_cu_seqlens
-        if self.pad_cu_seqlens:
-            assert pack_metadata_file_path is not None, (
-                "a metadata json file is required when pad_cu_seqlens is enabled"
-            )
-            assert self.pad_to_max_length is True, (
-                "'pad_to_max_length=True' is required when pad_cu_seqlens is enabled"
-            )
-
-        self.pack_metadata = None
-        if pack_metadata_file_path is not None:
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                with msc.open(str(pack_metadata_file_path), "r") as f:
-                    self.pack_metadata = json.load(f)
-            else:
-                with open(pack_metadata_file_path) as f:
-                    self.pack_metadata = json.load(f)
-
-    def __getitem__(self, idx):
-        if self.samples_mapping is not None:
-            # assert idx < len(self.samples_mapping)
-            idx = self.samples_mapping[idx]
-
-        input_ids = self.indexed_dataset[idx]["input_ids"]
-        seq_boundaries = self.indexed_dataset[idx]["seq_start_id"] + [len(input_ids)]
-        loss_mask = self.indexed_dataset[idx]["loss_mask"]
-        if idx < 0:
-            loss_mask = [0] * len(loss_mask)
-        return {"input_ids": input_ids, "seq_boundaries": seq_boundaries, "loss_mask": loss_mask}
-
-    def _load_dataset(self):
-        try:
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                self.indexed_dataset = msc.numpy.load(self.file_path, allow_pickle=True)
-            else:
-                self.indexed_dataset = np.load(self.file_path, allow_pickle=True)
-        except Exception as e:
-            logger.error(
-                f"Failed to load packed dataset. The dataset should be a `.npy` file. "
-                f"Please check if the packed dataset was prepared correctly. The original error was:\n {e}",
-            )
-            exit(1)
-
-    def _build_samples_mapping(self):
-        if self.max_num_samples is not None:
-            # custom samples mapping logic, following the format for unpacked sft dataset
-            # Note: this is epoch-level shuffling, i.e. sampling without replacement until end of epoch, then repeat.
-            # Unpacked dataset shuffles by sampling with replacement indefinitely.
-            dataset_len = len(self.indexed_dataset)
-            max_num_epochs = np.ceil(self.max_num_samples / dataset_len)
-            indices = np.arange(dataset_len)[None, :].repeat(max_num_epochs, axis=0)
-            [np.random.shuffle(x) for x in indices]
-            self.samples_mapping = indices.reshape(1, -1).squeeze()[: self.max_num_samples]
-        else:
-            self.samples_mapping = None
-
-    def _build_loss_mask(self, processed_example):
-        seq_boundaries = processed_example["seq_boundaries"]
-        if self.answer_only_loss:
-            return np.concatenate(
-                [
-                    processed_example["loss_mask"][seq_boundaries[i] : seq_boundaries[i + 1] - 1]
-                    for i in range(len(seq_boundaries) - 1)
-                ]
-            )
-        return np.concatenate(
-            [
-                [
-                    0 if x == self.tokenizer.eos_id else 1.0
-                    for x in processed_example["input_ids"][seq_boundaries[i] : seq_boundaries[i + 1] - 1]
-                ]
-                for i in range(len(seq_boundaries) - 1)
-            ]
-        )
-
-    def _maybe_cast_to_list(self, x):
-        return [item.tolist() if isinstance(item, np.ndarray) else item for item in x]
-
-    def collate_fn(self, batch):
-        """
-        Collates a list of packed sequence samples into a batch for the model.
-
-        This method is specifically designed for `GPTSFTPackedDataset`. It takes a list
-        of packed sequence items (as returned by `__getitem__`) and prepares a batch
-        of tensors. This includes handling `cu_seqlens` which are crucial for the
-        efficient processing of packed sequences with kernels like THD attention.
-
-        Args:
-            batch (List[dict]): A list of packed sequence samples.
-
-        Returns:
-            dict: A dictionary of batched tensors, including 'tokens', 'labels',
-                  'loss_mask', 'position_ids', and potentially 'cu_seqlens',
-                  'cu_seqlens_argmin', 'max_seqlen' if `return_cu_seqlen` is True.
-        """
-        input_ids = [
-            np.concatenate(
-                [
-                    item["input_ids"][item["seq_boundaries"][i] : item["seq_boundaries"][i + 1] - 1]
-                    for i in range(len(item["seq_boundaries"]) - 1)
-                ]
-            )
-            for item in batch
-        ]
-        labels = [
-            np.concatenate(
-                [
-                    item["input_ids"][item["seq_boundaries"][i] + 1 : item["seq_boundaries"][i + 1]]
-                    for i in range(len(item["seq_boundaries"]) - 1)
-                ]
-            )
-            for item in batch
-        ]
-
-        loss_mask = [self._build_loss_mask(item) for item in batch]
-
-        token_count = [item.shape[0] for item in input_ids]
-
-        if self.pad_to_max_length:
-            max_length = self.max_seq_length
-        else:
-            # pad to the nearest multiple of 16 for FP8 training
-            # for many datasets in practice, all packed sequence lengths are very close to the
-            # target length (2048, 4096, 8192), so there is very minimal padding
-            max_length = max(len(length) for length in input_ids)
-            max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, self.pad_seq_length_to_mult))
-        assert max_length <= self.max_seq_length
-
-        position_ids: list[list[int]] = []
-        cu_seqlens: list[list[int]] = []
-        # Only compute cu_seqlens_unpadded when pad_seq_to_mult > 1 (actual padding for CP)
-        cu_seqlens_unpadded: list[list[int]] | None = [] if self._pad_seq_to_mult > 1 else None
-        for item in batch:
-            position_ids.append([])
-            cu_seqlens.append([0])
-            if cu_seqlens_unpadded is not None:
-                cu_seqlens_unpadded.append([0])
-            seqlens = np.array(item["seq_boundaries"][1:]) - np.array(item["seq_boundaries"][:-1])
-            for length in seqlens:
-                # length minus 1 because input_ids is truncated by 1 for labels
-                position_ids[-1].extend(list(range(length - 1)))
-                cu_seqlens[-1].append(cu_seqlens[-1][-1] + length - 1)
-
-            # the last seq needs to be the max seq len because rope and attn kernels expect no padding
-            assert cu_seqlens[-1][-1] <= max_length
-
-            # since data is prepadded when cp_size > 1, there may be some extra padding at the end
-            # of the packed sequence. In this case, we need to add the max seq len to the end.
-            if cu_seqlens[-1][-1] != max_length:
-                cu_seqlens[-1].append(max_length)
-
-            if cu_seqlens_unpadded is not None:
-                for i in range(len(item["seq_boundaries"]) - 1):
-                    current_seq = item["input_ids"][item["seq_boundaries"][i] : item["seq_boundaries"][i + 1] - 1]
-
-                    # Stop unpadded lengths at the last non-eos token so padding eos are excluded.
-                    current_seq_arr = np.array(current_seq)
-                    non_eos_positions = np.where(current_seq_arr != self.tokenizer.eos_id)[0]
-                    seqlen_unpadded = non_eos_positions[-1] + 1 if non_eos_positions.size > 0 else 0
-                    cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1] + seqlen_unpadded)
-
-                # if extra paddings are added in the packed sequence, they can't be counted as
-                # actual tokens for training
-                if len(cu_seqlens[-1]) > len(cu_seqlens_unpadded[-1]):
-                    cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1])
-
-            if self.pad_cu_seqlens:
-                # pad cu_seqlens to a constant shape with zero length sequences
-                max_samples_per_bin = max(p["max_samples_per_bin"] for p in self.pack_metadata)
-                # plus 2 since cu_seqlens additionally contains 0 and may append max_length
-                pad_num = max_samples_per_bin - len(cu_seqlens[-1]) + 2
-                cu_seqlens[-1].extend([max_length] * pad_num)
-
-        assert len(input_ids[0]) == len(position_ids[0]), (
-            "Dataset problem: input_ids and position_ids lengths don't match"
-        )
-
-        input_ids = self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id)
-        labels = self._collate_item(labels, max_length=max_length, pad_id=self.tokenizer.eos_id)
-        loss_mask = self._collate_item(loss_mask, max_length=max_length, pad_id=0)
-        position_ids = self._collate_item(position_ids, max_length=max_length, pad_id=0)
-
-        tokens = torch.LongTensor(input_ids)
-        loss_mask = torch.LongTensor(loss_mask)
-        # drop any padding/eos tokens from contributing to the loss
-        loss_mask[tokens == self.tokenizer.eos_id] = 0
-
-        processed_batch = {
-            "tokens": tokens,
-            "labels": torch.LongTensor(labels),
-            "loss_mask": loss_mask,
-            "position_ids": torch.LongTensor(position_ids),
-            "token_count": token_count,
-        }
-
-        if self.return_cu_seqlen:
-            cu_seqlens = self._collate_item(
-                cu_seqlens, max_length=max(len(length) for length in cu_seqlens) + 1, pad_id=-1
-            )
-            # Pre-generate `cu_seqlens_argmin` and `max_seqlen` as CPU tensor to avoid device-to-host copies.
-            cu_seqlens = torch.IntTensor(cu_seqlens)
-            cu_seqlens_argmin = torch.argmin(cu_seqlens, dim=1, keepdim=True)
-            seqlens = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
-            max_seqlen, _ = seqlens.max(dim=1, keepdim=True)
-
-            if self.pad_cu_seqlens:
-                # If padding, use the global max seqlen, so that 'pad_cu_seqlens' is the same
-                # across all batches. This is maintly used compatiblity with megatron's implementation
-                # of cudagraphs, which uses the same cudagraphs over all batches.
-                dataset_max_seqlen = max(p["dataset_max_seqlen"] for p in self.pack_metadata)
-                min_pack_seq_len = min(p["min_packed_seqlen"] for p in self.pack_metadata)
-                padding_gap = max_length - min_pack_seq_len
-
-                # Use the larger of the two values to avoid NaN issues with attention kernel
-                safe_max_seqlen = max(dataset_max_seqlen, padding_gap)
-                max_seqlen = torch.IntTensor([safe_max_seqlen] * len(cu_seqlens))
-            else:
-                seqlens = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
-                max_seqlen, _ = seqlens.max(dim=1, keepdim=True)
-
-            cu_seqlens_batch = {
-                "attention_mask": torch.LongTensor([1] * len(input_ids)),  # no attention mask is needed for packed seq
-                "cu_seqlens": torch.IntTensor(cu_seqlens),  # cu_seqlens_q must be in dtype torch.int32
-                "cu_seqlens_argmin": cu_seqlens_argmin,  # only required for perf
-                "max_seqlen": max_seqlen,  # only required for perf
-            }
-
-            # Only include cu_seqlens_unpadded when pad_seq_to_mult > 1 (actual CP padding)
-            if cu_seqlens_unpadded is not None:
-                cu_seqlens_unpadded = self._collate_item(
-                    cu_seqlens_unpadded, max_length=max(len(length) for length in cu_seqlens_unpadded) + 1, pad_id=-1
-                )
-                cu_seqlens_unpadded = torch.IntTensor(cu_seqlens_unpadded)
-                cu_seqlens_unpadded_argmin = torch.argmin(cu_seqlens_unpadded, dim=1, keepdim=True)
-                cu_seqlens_batch["cu_seqlens_unpadded"] = cu_seqlens_unpadded
-                cu_seqlens_batch["cu_seqlens_unpadded_argmin"] = cu_seqlens_unpadded_argmin
-
-            processed_batch.update(cu_seqlens_batch)
-        else:
-            attention_mask = [self._create_attention_mask(max_length) for _ in batch]
-            processed_batch.update(
-                {
-                    "attention_mask": torch.stack(attention_mask),
-                }
-            )
-
-        return processed_batch
-
-
-class GPTSFTChatDataset(GPTSFTDataset):
-    """Dataset class for chat-based fine-tuning with optional HuggingFace chat template support.
-
-    Supports both legacy special token-based formatting and modern HuggingFace chat templates.
-    """
-
-    def __init__(
-        self,
-        file_path: str,
-        tokenizer: MegatronTokenizer,
-        use_hf_tokenizer_chat_template: bool = False,
-        tool_schemas: str | dict | None = None,
-        **kwargs,
-    ):
-        """
-        Initialize GPTSFTChatDataset with optional HuggingFace chat template support.
-
-        Accepts conversational data in ShareGPT format. If use_hf_tokenizer_chat_template is True, the dataset will
-        accept both ShareGPT and HuggingFace chat template format. In the case of ShareGPT format, it will try to convert
-        to HuggingFace format.
-
-        ShareGPT format:
-        {"conversations": [{"value": "...", "from": "User"}, {"value": "...", "from": "Assistant"}]}
-
-        HuggingFace chat template format:
-        {
-            "messages": [
-                {"role": "system", "content": "..."}, {"role": "user", "content": "..."},
-                {"role": "assistant", "content": "..."}
-            ]
-        }
-
-        Args:
-            file_path: Path to the dataset file
-            tokenizer: Tokenizer instance
-            use_hf_tokenizer_chat_template: If True, use HuggingFace tokenizer's apply_chat_template
-            tool_schemas: Tool schemas for function calling (JSON string or dict)
-            **kwargs: Additional arguments passed to parent GPTSFTDataset
-        """
-        self.use_hf_tokenizer_chat_template = use_hf_tokenizer_chat_template
-        self.tool_schemas = tool_schemas
-
-        # Parse tool_schemas if it's a JSON string
-        if isinstance(self.tool_schemas, str):
-            self.tool_schemas = json.loads(self.tool_schemas)
-
-        # Initialize parent class
-        super().__init__(file_path, tokenizer, **kwargs)
-
-        # Validate tokenizer if using HF chat template
-        if self.use_hf_tokenizer_chat_template:
-            if not hasattr(self.tokenizer, "_tokenizer") or not hasattr(
-                self.tokenizer._tokenizer, "apply_chat_template"
-            ):
-                raise ValueError(
-                    "Dataset configured to use HF tokenizer chat template, but tokenizer does not have "
-                    "apply_chat_template method. Please ensure you're using a HuggingFace tokenizer with "
-                    "a chat template defined."
-                )
-
-    def _maybe_validate_prompt_template(self):
-        pass
-
-    def _build_samples_mapping(self):
-        super()._build_samples_mapping()
-
-        # Only build special token IDs if not using HF chat template
-        if not self.use_hf_tokenizer_chat_template:
-            LABEL_START = self.special_tokens["label_start"]
-            END_NAME_SIGNAL = self.special_tokens["end_of_name"]
-
-            id1 = _tokenize(self.tokenizer, PREFIX_STR)
-            id2 = _tokenize(self.tokenizer, PREFIX_STR + LABEL_START)
-            self.label_start_tokens = id2[len(id1) :]
-
-            id1 = _tokenize(self.tokenizer, PREFIX_STR + END_NAME_SIGNAL)
-            id2 = _tokenize(self.tokenizer, PREFIX_STR)
-            self.name_end_token_ids = id1[len(id2) :]
-
-            id1 = _tokenize(self.tokenizer, PREFIX_STR + self.special_tokens["turn_start"])
-            id2 = _tokenize(self.tokenizer, PREFIX_STR)
-            self.num_turn_start_tokens = len(id1) - len(id2)
-
-    def _process_example(self, example):
-        """
-        Create an example by concatenating text and answer.
-        Truncation is carried out when needed, but it is performed only on the prompt side.
-        BOS, EOS, and SEP, are added if specified.
-        """
-        if not self.use_hf_tokenizer_chat_template:
-            # Use legacy special token-based preprocessing
-            result = _preprocess(
-                example,
-                self.tokenizer,
-                self.name_end_token_ids,
-                self.label_start_tokens,
-                self.special_tokens,
-                self.num_turn_start_tokens,
-            )
-        else:
-            # Use HuggingFace chat template preprocessing
-            result = _chat_preprocess(example, self.tokenizer, self.tool_schemas)
-
-        # store metadata in dataset, in case user may have keys required in the prediction json files
-        metadata = {k: v for k, v in example.items() if k not in ["conversations", "messages"]}
-        result["metadata"] = metadata
-        if self.output_original_text:
-            # Store original conversation/messages for both formats
-            for key in ["conversations", "messages"]:
-                if key in example:
-                    result["metadata"][key] = example[key]
-
-        return result
-
-    def collate_fn(self, batch):
-        """
-        Collates a list of processed chat examples into a batch for model input.
-
-        This function takes a list of individual processed chat samples (from `__getitem__`,
-        which internally uses `_process_example`) and groups them into a batch. It handles
-        padding of sequences to the maximum length in the batch (or `self.max_seq_length`
-        if `pad_to_max_length` is True), and prepares all necessary tensors for the model,
-        similar to the base class collate_fn but specific to chat data structure.
-
-        Args:
-            batch (List[dict]): A list of dictionaries, where each dictionary is a
-                                sample processed by `_process_example`.
-
-        Returns:
-            dict: A dictionary of batched tensors ready for model input. Key tensors include
-                  'tokens', 'labels', 'loss_mask', 'position_ids', and 'attention_mask'.
-        """
-        # Removes the last token from each input sequence to ensure the model
-        # never sees the token it is supposed to predict. This enforces an
-        # autoregressive training setup where the model learns to generate
-        # the next token step-by-step.
-        input_ids = [item["input_ids"][:-1].tolist() for item in batch]
-        # Removes the first token from each input sequence to create labels
-        # that align with the model's prediction target. This ensures that
-        # at time step `t`, the model's output is evaluated against the token
-        # that originally followed the input at `t` in the dataset.
-        labels = [item["input_ids"][1:].tolist() for item in batch]
-        # Context tokens remain unchanged, representing the initial portion of
-        # the sequence that serves as input to the model. This allows the model
-        # to condition its predictions on prior information.
-        contexts = [item["context_ids"].tolist() for item in batch]
-        # Extracts the assistant's response portion of the sequence, which
-        # represents the part the model is trained to generate. This helps
-        # distinguish between the input prompt and the expected model output.
-        answers = [item["answer_ids"].tolist() for item in batch]
-        # Removes the first element from the mask to align with the shifted labels,
-        # ensuring that loss is only computed for valid, predictable tokens. This
-        # prevents the model from incurring loss on tokens that were never meant to
-        # be predicted, such as user-provided context or padding.
-        loss_mask = [item["loss_mask"][1:].tolist() for item in batch]
-        # Metadata remains unchanged, carrying any additional non-token-related
-        # information that might be useful for evaluation, debugging, or tracking
-        # purposes.
-        metadata = [item["metadata"] for item in batch]
-        max_length = max(max([len(x) for x in input_ids]), max([len(x) for x in contexts]) + self.tokens_to_generate)
-
-        if max_length > self.max_seq_length:
-            # truncate the sequences if it is longer than max_seq_length
-            input_ids = [x[: self.max_seq_length] for x in input_ids]
-            labels = [x[: self.max_seq_length] for x in labels]
-            loss_mask = [x[: self.max_seq_length] for x in loss_mask]
-
-            # Safety check: warn if truncation removed all trainable tokens
-            for i, x in enumerate(loss_mask):
-                x_tensor = torch.tensor(x)
-                if x_tensor.sum().item() == 0:
-                    logger.warning(
-                        "Due to truncation to max_seq_length, no assistant tokens are found in sample. "
-                        "Setting loss_mask to all ones."
-                    )
-                    loss_mask[i] = [1] * self.max_seq_length
-
-            contexts = [x[: self.max_seq_length] for x in contexts]
-            answers = [x[: self.max_seq_length] for x in answers]
-
-        # increase max length to nearest multiple of 4 or 8
-        if self.pad_to_max_length:
-            max_length = self.max_seq_length
-        else:
-            max_length = min(
-                self.max_seq_length,
-                self._ceil_to_nearest(max_length, max(16, self.pad_seq_length_to_mult)),
-            )
-        assert max_length <= self.max_seq_length
-
-        position_ids = [list(range(max_length)) for _ in batch]
-        position_ids = torch.LongTensor(position_ids)
-        input_ids = torch.LongTensor(
-            self._collate_item(input_ids, max_length=max_length, pad_id=self.tokenizer.eos_id)
-        )
-        labels = torch.LongTensor(self._collate_item(labels, max_length=max_length, pad_id=self.tokenizer.eos_id))
-        loss_mask = torch.LongTensor(self._collate_item(loss_mask, max_length=max_length, pad_id=0))
-        context_lengths = torch.LongTensor([len(x) for x in contexts])
-        contexts = torch.LongTensor(self._collate_item(contexts, max_length=max_length, pad_id=self.tokenizer.eos_id))
-        answers = torch.LongTensor(self._collate_item(answers, max_length=max_length, pad_id=self.tokenizer.eos_id))
-
-        processed_batch = {
-            "tokens": input_ids,
-            "labels": labels,
-            "loss_mask": loss_mask,
-            "position_ids": position_ids,
-            "contexts": contexts,
-            "context_lengths": context_lengths,
-            "answers": answers,
-            "metadata": metadata,
-        }
-
-        if not self.get_attention_mask_from_fusion:
-            attention_mask = [self._create_attention_mask(max_length) for _ in batch]
-            attention_mask = torch.stack(attention_mask)
-            processed_batch["attention_mask"] = attention_mask
-
-        return processed_batch
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/builders/finetuning_dataset.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import warnings
-from pathlib import Path
-from typing import Any, Optional, Union
-
-import torch
-from megatron.core.msc_utils import MultiStorageClientFeature
-from megatron.core.tokenizers.text.libraries import HuggingFaceTokenizer
-
-from megatron.bridge.data.datasets.packed_parquet import (
-    is_packed_parquet_spec,
-    resolve_packed_parquet_paths,
-)
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-from megatron.bridge.data.datasets.sft import create_sft_dataset
-from megatron.bridge.utils.common_utils import get_rank_safe, print_rank_0
-
-
-logger = logging.getLogger(__name__)
-
-
-class FinetuningDatasetBuilder:
-    """Builder class for fine-tuning datasets.
-
-    This class provides methods to build datasets for fine-tuning large language models.
-    It follows a builder pattern similar to BlendedMegatronDatasetBuilder but adapted for
-    fine-tuning scenarios.
-
-    Args:
-        dataset_root (Union[str, Path]): The root directory containing training, validation, and test data.
-        tokenizer: The tokenizer to use for preprocessing text.
-        is_built_on_rank (Callable): Function that returns True if the dataset should be built on current rank.
-        seq_length (int, optional): The maximum sequence length. Defaults to 2048.
-        seed (int, optional): Random seed for data shuffling. Defaults to 1234.
-        memmap_workers (int, optional): Number of worker processes for memmap datasets. Defaults to 1.
-        max_train_samples (int, optional): Maximum number of training samples. Defaults to None.
-        packed_sequence_specs (Optional[PackedSequenceSpecs], optional): Specifications for packed sequences. Defaults to None.
-        dataset_kwargs (Optional[dict[str, Any]], optional): Additional dataset creation arguments. Defaults to None.
-        do_validation (bool, optional): Whether to build the validation dataset. Defaults to True.
-        do_test (bool, optional): Whether to build the test dataset. Defaults to True.
-    """
-
-    def __init__(
-        self,
-        dataset_root: Union[str, Path],
-        tokenizer,
-        seq_length: int = 2048,
-        seed: int = 1234,
-        memmap_workers: int = 1,
-        max_train_samples: Optional[int] = None,
-        packed_sequence_specs: Optional[PackedSequenceSpecs] = None,
-        dataset_kwargs: Optional[dict[str, Any]] = None,
-        do_validation: bool = True,
-        do_test: bool = True,
-    ):
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            self.dataset_root = msc.Path(dataset_root)
-        else:
-            self.dataset_root = Path(dataset_root)
-        self.tokenizer = tokenizer
-        self.seq_length = seq_length
-        self.seed = seed
-        self.memmap_workers = memmap_workers
-        self.max_train_samples = max_train_samples
-        self.packed_sequence_specs = packed_sequence_specs
-        self.packed_sequence_size = -1 if not packed_sequence_specs else packed_sequence_specs.packed_sequence_size
-        self.dataset_kwargs = dataset_kwargs or {}
-        self._pad_cu_seqlens = False if not packed_sequence_specs else packed_sequence_specs.pad_cu_seqlens
-        self._pad_seq_to_mult = None if not packed_sequence_specs else packed_sequence_specs.pad_seq_to_mult
-
-        self.do_validation = do_validation
-        self.do_test = do_test
-
-        print_rank_0(f"Building FinetuningDatasetBuilder with root={self.dataset_root}")
-
-        if self.packed_sequence_size > 0:
-            print_rank_0(f"Using packed sequences with size {self.packed_sequence_size}")
-
-    def prepare_data(self) -> None:
-        """Prepare data if needed."""
-        self.prepare_packed_data()
-
-    def prepare_packed_data(self) -> None:
-        """Prepare packed sequence data files if configured.
-
-        Skips preparation if:
-        - packed_sequence_size <= 0 (packing disabled)
-        - packed data files already exist (parquet or legacy .npy)
-        """
-        if self.packed_sequence_size <= 0:
-            return
-
-        self._prepare_packed_split(
-            split_name="training",
-            packed_path=self.train_path_packed,
-            input_path=self.train_path,
-        )
-
-        if not self.do_validation:
-            return
-
-        self._prepare_packed_split(
-            split_name="validation",
-            packed_path=self.validation_path_packed,
-            input_path=self.validation_path,
-        )
-
-    def _prepare_packed_split(
-        self,
-        split_name: str,
-        packed_path: Union[str, Path],
-        input_path: Path,
-    ) -> None:
-        """Prepare a single packed data split if it doesn't already exist.
-
-        Args:
-            split_name: Name of the split (for logging).
-            packed_path: Output path for the packed data.
-            input_path: Input path to the raw dataset.
-        """
-        from megatron.bridge.data.datasets.packed_sequence import prepare_packed_sequence_data
-
-        if self._packed_path_exists(packed_path):
-            print_rank_0(f"Skipping packed {split_name} data preparation - already exists: {packed_path}")
-            return
-
-        packed_path_str = str(packed_path)
-        if packed_path_str.lower().endswith(".npy"):
-            warnings.warn(
-                "Automatic .npy packed sequence preparation is deprecated and will be removed in the next release. "
-                "Please use packed parquet format instead.",
-                DeprecationWarning,
-                stacklevel=3,
-            )
-            return
-
-        print_rank_0(f"Preparing packed {split_name} data at {packed_path}")
-        prepare_packed_sequence_data(
-            input_path=input_path,
-            output_path=packed_path,
-            output_metadata_path=self.pack_metadata,
-            packed_sequence_size=self.packed_sequence_size,
-            tokenizer=self.tokenizer,
-            max_seq_length=self.seq_length,
-            seed=self.seed,
-            dataset_kwargs=self.dataset_kwargs,
-            pad_seq_to_mult=self._pad_seq_to_mult,
-        )
-
-    def _packed_path_exists(self, path: Union[str, Path]) -> bool:
-        """Check if a packed data path exists.
-
-        For .npy files: check file exists
-        For packed parquet specs: check if resolution returns non-empty
-
-        Args:
-            path: The path to check
-
-        Returns:
-            True if the packed data exists
-        """
-        path_str = str(path)
-
-        # For packed parquet specs, check if resolution returns files
-        if is_packed_parquet_spec(path_str):
-            try:
-                resolved = resolve_packed_parquet_paths(path_str)
-                return len(resolved) > 0
-            except ValueError:
-                return False
-
-        # For .npy or other files, check existence
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            return msc.Path(path_str).is_file()
-        else:
-            return Path(path_str).is_file()
-
-    def build(self) -> list[Optional[Any]]:
-        """Build train, validation, and test datasets.
-
-        This method creates the necessary datasets based on the configuration.
-        It first ensures data preparation (e.g., packing) is done (on rank 0),
-        then builds the datasets potentially using the prepared files.
-
-        Returns:
-            A list containing the train, validation, and test datasets.
-            Elements can be None if the corresponding data file doesn't exist
-            or if dataset building is skipped for the split.
-        """
-        # Prepare packed data if needed
-        if get_rank_safe() == 0:
-            self.prepare_data()
-
-        if torch.distributed.is_initialized():
-            torch.distributed.barrier()
-
-        # This needs to be called on all ranks
-        datasets: list[Optional[Any]] = self._build_datasets()
-        return datasets
-
-    def _build_datasets(self) -> list[Optional[Any]]:
-        """Internal method to build all datasets.
-
-        Returns:
-            list[Optional[Any]]: The train, validation, and test datasets.
-        """
-        train_ds = self._create_dataset(
-            self.train_path if self.packed_sequence_size <= 0 else self.train_path_packed,
-            pack_metadata_path=None if self.packed_sequence_size <= 0 else self.pack_metadata,
-            max_num_samples=self.max_train_samples,
-            **self.dataset_kwargs,
-        )
-
-        if self.do_validation:
-            valid_ds = self._create_dataset(
-                self.validation_path if self.packed_sequence_size <= 0 else self.validation_path_packed,
-                pack_metadata_path=None if self.packed_sequence_size <= 0 else self.pack_metadata,
-                is_test=True,
-                **self.dataset_kwargs,
-            )
-        else:
-            valid_ds = None
-
-        if self.do_test:
-            test_ds = self._create_dataset(
-                self.test_path,
-                is_test=True,
-                **self.dataset_kwargs,
-            )
-        else:
-            test_ds = None
-
-        return [train_ds, valid_ds, test_ds]
-
-    def _create_dataset(
-        self,
-        path: Union[str, Path],
-        pack_metadata_path: Optional[Union[str, Path]] = None,
-        is_test: bool = False,
-        **kwargs: Any,
-    ) -> Optional[Any]:
-        """Create a single dataset instance (train, validation, or test).
-
-        Args:
-            path: Path to the dataset file or packed parquet spec
-            pack_metadata_path: Path to the packed sequence metadata
-            is_test: Whether this is a test dataset
-            **kwargs: Additional arguments to pass to the dataset constructor
-
-        Returns:
-            The created dataset
-        """
-        path_str = str(path)
-
-        # Check if path exists - handle packed parquet specs differently
-        if is_packed_parquet_spec(path_str):
-            # For packed parquet specs, check via resolution
-            try:
-                resolved = resolve_packed_parquet_paths(path_str)
-                path_exists = len(resolved) > 0
-            except ValueError:
-                path_exists = False
-        else:
-            # Standard file/path existence check
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                path_exists = msc.Path(path_str).exists()
-            else:
-                path_exists = Path(path_str).exists()
-
-        if not path_exists:
-            print_rank_0(f"Warning: Dataset path {path} does not exist")
-            return None
-
-        is_not_packing = self.packed_sequence_size <= 0
-
-        # For packed parquet from external sources, only pass metadata if pad_cu_seqlens is True
-        # This avoids "missing metadata" errors when using externally prepared packed data
-        effective_metadata_path = None
-        if not is_not_packing:
-            if self._pad_cu_seqlens:
-                # pad_cu_seqlens requires metadata
-                effective_metadata_path = pack_metadata_path
-            elif is_packed_parquet_spec(path_str):
-                # Externally prepared packed parquet without pad_cu_seqlens doesn't need metadata
-                effective_metadata_path = None
-            else:
-                # .npy files prepared by MB include metadata
-                effective_metadata_path = pack_metadata_path
-
-        return create_sft_dataset(
-            path,
-            tokenizer=self.tokenizer,
-            seq_length=(self.seq_length if is_not_packing else self.packed_sequence_size),
-            memmap_workers=self.memmap_workers,
-            seed=self.seed,
-            is_test=is_test,
-            pack_metadata_file_path=effective_metadata_path,
-            pad_cu_seqlens=False if is_not_packing else self._pad_cu_seqlens,
-            pad_seq_to_mult=1 if is_not_packing else self._pad_seq_to_mult,
-            **kwargs,
-        )
-
-    @property
-    def train_path(self) -> Path:
-        """Path to the training dataset file (training.jsonl)."""
-        return self.dataset_root / "training.jsonl"
-
-    @property
-    def default_pack_path(self) -> Path:
-        """The default directory path for storing packed sequence files.
-
-        Constructed based on the dataset root and tokenizer model name.
-        Creates the directory if it doesn't exist.
-
-        Returns:
-            The Path object for the default packing directory.
-        """
-        tokenizer_model_name = self._extract_tokenizer_model_name()
-        default_pack_path = (
-            self.dataset_root / "packed" / f"{tokenizer_model_name}_pad_seq_to_mult{self._pad_seq_to_mult}"
-        )
-        if not default_pack_path.exists():
-            default_pack_path.mkdir(parents=True, exist_ok=True)
-            logger.info(f"Using default path for packing files: {str(default_pack_path)}")
-
-        return default_pack_path
-
-    @property
-    def pack_metadata(self) -> Path:
-        """Path to the metadata file for packed sequences.
-
-        Determined by `packed_sequence_specs` or defaults based on the
-        `default_pack_path` and `packed_sequence_size`.
-
-        Returns:
-            The Path object for the packed sequence metadata file.
-
-        Raises:
-            ValueError: If packed sequences are not configured.
-        """
-        if self.packed_sequence_size > 0:
-            if self.packed_sequence_specs.packed_metadata_path is not None:
-                return self.packed_sequence_specs.packed_metadata_path
-            return self.default_pack_path / f"{self.packed_sequence_size}_metadata.jsonl"
-        else:
-            raise ValueError("pack_metadata invalid since packed sequence size is not specified.")
-
-    @property
-    def train_path_packed(self) -> Path:
-        """Path to the packed training dataset file.
-
-        Determined by `packed_sequence_specs` or defaults based on the
-        `default_pack_path` and `packed_sequence_size`.
-
-        Returns:
-            The Path object for the packed training data file.
-
-        Raises:
-            ValueError: If packed sequences are not configured.
-        """
-        if self.packed_sequence_size > 0:
-            if self.packed_sequence_specs.packed_train_data_path is not None:
-                return self.packed_sequence_specs.packed_train_data_path
-            return self.default_pack_path / f"training_{self.packed_sequence_size}.idx.parquet"
-        else:
-            raise ValueError("`train_path_packed` invalid since packed sequence size is not specified.")
-
-    @property
-    def validation_path_packed(self) -> Path:
-        """Path to the packed validation dataset file.
-
-        Determined by `packed_sequence_specs` or defaults based on the
-        `default_pack_path` and `packed_sequence_size`.
-
-        Returns:
-            The Path object for the packed validation data file.
-
-        Raises:
-            ValueError: If packed sequences are not configured.
-        """
-        if self.packed_sequence_size > 0:
-            if self.packed_sequence_specs.packed_val_data_path is not None:
-                return self.packed_sequence_specs.packed_val_data_path
-            return self.default_pack_path / f"validation_{self.packed_sequence_size}.idx.parquet"
-        else:
-            raise ValueError("`validation_path_packed` invalid since packed sequence size is not specified.")
-
-    @property
-    def validation_path(self) -> Path:
-        """Path to the validation dataset file (validation.jsonl)."""
-        return self.dataset_root / "validation.jsonl"
-
-    @property
-    def test_path(self) -> Path:
-        """Path to the test dataset file (test.jsonl)."""
-        return self.dataset_root / "test.jsonl"
-
-    def _extract_tokenizer_model_name(self) -> str:
-        """Automatically get the model name from model path."""
-        # Legacy tokenizer compatibility
-        tokenizer_cls = HuggingFaceTokenizer
-        tokenizer_instance = self.tokenizer._tokenizer
-
-        if self.packed_sequence_specs and self.packed_sequence_specs.tokenizer_model_name is not None:
-            return self.packed_sequence_specs.tokenizer_model_name
-        elif isinstance(tokenizer_instance, tokenizer_cls):
-            name = self.tokenizer.path
-
-            if name.endswith("context/nemo_tokenizer"):
-                # NEMO_HOME/hf_org/hf_model/context/nemo_tokenizer => hf_org--hf_model
-                tokenizer_model_name = "--".join(name.split("/")[-4:-2])
-            elif name.endswith("nemo_tokenizer"):
-                # NEMO_HOME/hf_org/hf_model/nemo_tokenizer => hf_org--hf_model
-                tokenizer_model_name = "--".join(name.split("/")[-3:-1])
-            else:
-                # hf_org/hf_model => hf_org--hf_model
-                tokenizer_model_name = name.replace("/", "--")
-            return tokenizer_model_name
-        else:
-            return f"unknown_tokenizer_{hash(self.tokenizer)}"
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/datasets/packed_sequence.py
-```py
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import logging
-import multiprocessing as mp
-import warnings
-from dataclasses import dataclass
-from multiprocessing import Pool
-from pathlib import Path
-
-import numpy as np
-from megatron.core.msc_utils import MultiStorageClientFeature
-from tqdm import tqdm
-
-from megatron.bridge.data.datasets.packed_parquet import (
-    is_packed_parquet_spec,
-    resolve_packed_parquet_paths,
-)
-from megatron.bridge.data.datasets.packing_utils import create_hist, create_packing_strategy, fill_packing_strategy
-from megatron.bridge.data.datasets.sft import create_sft_dataset
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-
-
-logger = logging.getLogger(__name__)
-
-_shared_dataset = None
-
-
-def _tokenize_get_item(i):
-    return _shared_dataset[i]
-
-
-def _tokenize_init_worker(dataset):
-    global _shared_dataset
-    _shared_dataset = dataset
-
-
-def _retrieve_tokenized(dataset, num_workers):
-    if num_workers == 1:
-        return np.array([dataset[i] for i in tqdm(range(len(dataset)))])
-    num_workers = num_workers if num_workers > 0 else mp.cpu_count()
-    with Pool(num_workers, initializer=_tokenize_init_worker, initargs=(dataset,)) as pool:
-        return np.array(list(tqdm(pool.imap(_tokenize_get_item, range(len(dataset))), total=len(dataset))))
-
-
-def tokenize_dataset(
-    path: Path,
-    tokenizer: MegatronTokenizer,
-    max_seq_length: int,
-    seed: int,
-    dataset_kwargs: dict | None = None,
-    pad_seq_to_mult: int | None = 1,
-    num_tokenizer_workers: int = -1,
-):
-    """
-    Tokenizes a dataset from the provided path using the specified tokenizer
-    and prepares it for further processing.
-
-    Args:
-        path (Path): Path to the dataset file.
-        tokenizer (MegatronTokenizer): The tokenizer to use for tokenization.
-        max_seq_length (int): Maximum sequence length for the tokens.
-        seed (int): Random seed for shuffling the dataset.
-        dataset_kwargs (dict | None): Additional keyword arguments to pass to create_sft_dataset.
-            Can include 'chat', 'use_hf_tokenizer_chat_template', 'tool_schemas', etc.
-        pad_seq_to_mult (int | None): Optional multiple to pad each sequence to during packing
-            preparation (e.g., set to 2 * context_parallel_size for THD CP).
-
-    Returns:
-        np.ndarray: A NumPy array containing the tokenized data.
-    """
-    if not dataset_kwargs:
-        dataset_kwargs = {}
-
-    # Handle tool_schemas - convert to JSON string if needed
-    ts = dataset_kwargs.get("tool_schemas")
-    if ts and not isinstance(ts, str):
-        dataset_kwargs["tool_schemas"] = json.dumps(ts)
-
-    # Handle chat_template - set it on tokenizer if provided
-    chat_template = dataset_kwargs.pop("chat_template", None)
-    if chat_template:
-        # This is called during packing preparation (rank 0 only).
-        # The chat template is only needed to create the packed .npy files.
-        # Once created, all ranks load the pre-tokenized .npy files.
-        if hasattr(tokenizer, "_tokenizer"):
-            tokenizer._tokenizer.chat_template = chat_template
-
-    if pad_seq_to_mult is not None and pad_seq_to_mult <= 0:
-        raise ValueError("pad_seq_to_mult must be a positive integer when provided.")
-
-    # Keep the historical minimum of 16 unless a larger multiple is requested.
-    pad_seq_length_to_mult = 1 if pad_seq_to_mult is None else max(1, pad_seq_to_mult)
-
-    dataset = create_sft_dataset(
-        path=path,
-        tokenizer=tokenizer,
-        seq_length=max_seq_length,
-        seed=seed,
-        is_test=True,
-        pad_seq_length_to_mult=pad_seq_length_to_mult,
-        **dataset_kwargs,
-    )
-
-    pad_id = dataset.tokenizer.eod
-    pad_seq_length_to_mult = dataset.pad_seq_length_to_mult
-    max_seq_length = dataset.max_seq_length
-    dataset = _retrieve_tokenized(dataset, num_tokenizer_workers)
-
-    if pad_seq_to_mult > 1:
-
-        def pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id):
-            """
-            Pad each individual data point to the length of max_length_to_pad.
-            This keeps packed samples divisible by the requested multiple (used for CP/THD).
-            """
-            assert max_seq_length >= max_length_to_pad
-            for key, val in data.items():
-                if key in {"input_ids", "context_ids"}:
-                    if len(val) <= max_length_to_pad:
-                        # input_ids are truncated by 1 for labels; add 1 extra pad token
-                        val = val + [pad_id] * (max_length_to_pad - len(val) + 1)
-                    elif len(val) > max_seq_length:
-                        logging.info(
-                            "Sequence length %d is larger than max_seq_length %d; truncating for packing.",
-                            len(val),
-                            max_seq_length,
-                        )
-                        val = val[:max_seq_length]
-                    data[key] = val
-            return
-
-        def ceil_to_nearest(n, m):
-            return (n + m - 1) // m * m
-
-        for data in dataset:
-            max_length_to_pad = min(max_seq_length, ceil_to_nearest(len(data["input_ids"]), pad_seq_length_to_mult))
-            pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id)
-
-    return dataset
-
-
-def prepare_packed_sequence_data(
-    input_path: Path,
-    output_path: Path,
-    output_metadata_path: Path,
-    packed_sequence_size: int,
-    tokenizer: MegatronTokenizer,
-    max_seq_length: int,
-    seed: int | None = 0,
-    packing_algorithm: str = "first_fit_shuffle",
-    dataset_kwargs: dict | None = None,
-    pad_seq_to_mult: int | None = 1,
-    num_tokenizer_workers: int = -1,
-):
-    """
-    Prepares a packed sequence dataset from a given input file and saves it to an output file.
-
-    Args:
-        input_path (Path): Path to the input dataset file.
-        output_path (Path): Path to save the packed sequence data.
-        output_metadata_path (Path): Path to save packing metadata.
-        packed_sequence_size (int): The maximum size for each packed sequence.
-        tokenizer (MegatronTokenizer): The tokenizer to use for tokenization.
-        max_seq_length (int): Maximum sequence length for the tokens.
-        seed (int | None): Random seed for shuffling (optional).
-        packing_algorithm (str): The algorithm used for packing sequences
-                currently supports "first_fit_shuffle" and "first_fit_decreasing".
-        dataset_kwargs (dict | None): Additional keyword arguments to pass to create_sft_dataset.
-            Enables packing with chat templates, tool schemas, etc.
-        pad_seq_to_mult (int | None): Optional multiple to pad each sequence to during packing
-            preparation (e.g., set to 2 * context_parallel_size for THD CP).
-
-    Returns:
-        None: Saves the packed sequence data to the specified output path.
-    """
-    logger.info(f"Preparing packed sequence from {input_path}")
-    dataset = tokenize_dataset(
-        input_path,
-        tokenizer,
-        max_seq_length,
-        seed,
-        dataset_kwargs,
-        pad_seq_to_mult=pad_seq_to_mult,
-        num_tokenizer_workers=num_tokenizer_workers,
-    )
-    sequences, histogram = create_hist(dataset, max_seq_length)
-
-    assignments, packing_metadata = create_packing_strategy(histogram, packed_sequence_size, packing_algorithm)
-    output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size, tokenizer.eos_id)
-
-    # save output data
-    output_path_str = str(output_path)
-    if output_path_str.lower().endswith((".parquet", ".pq")):
-        from megatron.bridge.data.datasets.packed_parquet import write_packed_parquet
-
-        write_packed_parquet(output_data, output_path)
-    else:
-        # Legacy .npy format
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            msc.numpy.save(output_path, output_data)
-        else:
-            np.save(output_path, output_data)
-
-    # save packing metadata, packing_metadata is appended to the packing file if it exists
-    if output_metadata_path is not None:
-        try:
-            with output_metadata_path.open(mode="r") as f:
-                packing_metadata_file = json.load(f)
-                # 'packing_metadata_file' is expected to be a list of dicts: List[Dict[str, int]]
-                # Each dict corresponds to a packed dataset. Typically there will be two dicts,
-                # one each for the packed val and train datasets.
-                # Each dict records two values: 'max_samples_per_bin', the max
-                # number of samples per packed sequence, and 'dataset_max_seqlen', the max
-                # sequence length per sample in the packed dataset.
-                assert isinstance(packing_metadata_file, list), "invalid packing_metadata_file!"
-        except FileNotFoundError:
-            packing_metadata_file = []
-
-        packing_metadata_file.append(packing_metadata)
-        with output_metadata_path.open(mode="w") as f:
-            json.dump(packing_metadata_file, f)
-
-    logger.info(f"Packed sequence is prepared and saved to {output_path}")
-
-
-@dataclass
-class PackedSequenceSpecs:
-    """
-    Configuration class for packed sequence datasets.
-
-    This class holds parameters related to sequence packing, including the size of the packed sequences,
-    tokenizer information, paths to packed data files, and other related settings.
-    """
-
-    packed_sequence_size: int = -1
-    """
-    If a positive integer, this arg enables training with sequence packing and specifies the pack size
-    If less than or equal to 0, sequence packing is disabled. Defaults to -1.
-    Note: This arg is distinct from `seq_length` because `seq_length` specifies the maximum length
-    of the original sequence (i.e. the length to truncate long sequences in the input data).
-    """
-
-    tokenizer_model_name: str = None
-    """
-    Keep track of tokenizer model name, since each tokenizer produces a different packed sequence dataset file.
-    This field is set by llm.finetune api.
-    """
-
-    num_tokenizer_workers: int = -1
-    """
-    The number of worker processes to use for tokenization when preparing the packed sequence dataset.
-    If -1, the number of workers will be set to the number of CPU cores available
-    """
-
-    packed_train_data_path: str = None
-    """
-    If specified, use this file for the packed training dataset instead of the default path.
-    """
-
-    packed_val_data_path: str = None
-    """
-    If specified, use this file for the packed validation dataset instead of the default path.
-    """
-
-    packed_metadata_path: str = None
-    """
-    If specified, use this file for the training and validation packing metadata file instead of the default path.
-    """
-
-    pad_cu_seqlens: bool = False
-    """
-    If True, pad cu_seqlens to a constant size, which is required for use with cudagraphs.
-    """
-    pad_seq_to_mult: int | None = 1
-    """
-    Optional multiple to pad each sample to when generating packed datasets.
-    For THD/context parallel, set to (context_parallel_size * 2) to keep samples divisible.
-    """
-
-    def __post_init__(self):
-        if self.packed_train_data_path is not None:
-            self._validate_packed_path("packed_train_data_path", self.packed_train_data_path)
-
-        if self.packed_val_data_path is not None:
-            self._validate_packed_path("packed_val_data_path", self.packed_val_data_path)
-
-        if self.pad_seq_to_mult is not None and self.pad_seq_to_mult <= 0:
-            raise ValueError("pad_seq_to_mult must be a positive integer when provided.")
-
-    def _validate_packed_path(self, attr_name: str, path_value: str) -> None:
-        """Validate a packed data path and store it appropriately.
-
-        For .npy files: strict validation with Path.exists()
-        For packed parquet specs: validate via resolution (supports dirs/globs)
-
-        Args:
-            attr_name: The attribute name being validated (for error messages)
-            path_value: The path value to validate
-
-        Raises:
-            FileNotFoundError: If the path does not exist or resolves to no files
-            ValueError: If the path format is invalid
-        """
-        path_str = str(path_value)
-
-        # Check if it's an .npy file (legacy format)
-        if path_str.lower().endswith(".npy"):
-            warnings.warn(
-                f"The .npy packed sequence format is deprecated and will be removed in the next release. "
-                f"Please use packed parquet format instead. Path: {path_str}",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                path_obj = msc.Path(path_str)
-            else:
-                path_obj = Path(path_str)
-
-            if not path_obj.exists():
-                raise FileNotFoundError(f"{attr_name} file does not exist: {path_str}")
-            setattr(self, attr_name, path_obj)
-            return
-
-        # Check if it's a packed parquet spec (file/dir/glob)
-        if is_packed_parquet_spec(path_str):
-            # Validate by resolving - this checks that files actually exist
-            try:
-                resolved_paths = resolve_packed_parquet_paths(path_str)
-                if len(resolved_paths) == 0:
-                    raise FileNotFoundError(f"{attr_name} resolved to no files: {path_str}")
-            except ValueError as e:
-                raise FileNotFoundError(f"{attr_name} could not be resolved: {path_str}. Error: {e}") from e
-
-            # Store the original string spec (not Path) to preserve globs
-            # The dataset loader will handle resolution
-            setattr(self, attr_name, path_str)
-            return
-
-        # Neither .npy nor valid packed parquet spec
-        raise ValueError(
-            f"{attr_name} must be a .npy file or a packed parquet spec "
-            f"(file/directory/glob ending in .parquet or .pq): {path_str}"
-        )
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/data/datasets/packed_parquet.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Packed Parquet dataset support for SFT training.
-
-This module provides GPTSFTPackedParquetDataset, which reads packed sequence data
-from Parquet files as an alternative to the NumPy-based GPTSFTPackedDataset.
-
-Supports multiple files via:
-- Single file: "data.idx.parquet", "shard_0.parquet"
-- Glob pattern: "data*.idx.parquet", "shard_*.parquet"
-- Directory: "/path/to/data/" (globs for *.parquet and *.pq)
-
-Key functions:
-- is_packed_parquet_spec(): Check if a spec refers to packed Parquet data
-- resolve_packed_parquet_paths(): Resolve a spec to actual file paths
-"""
-
-from __future__ import annotations
-
-import bisect
-import glob
-import logging
-import os
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-import numpy as np
-from megatron.core.msc_utils import MultiStorageClientFeature
-
-from megatron.bridge.data.datasets.sft import GPTSFTPackedDataset
-
-
-if TYPE_CHECKING:
-    from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-
-logger = logging.getLogger(__name__)
-
-# Required columns in packed Parquet schema
-REQUIRED_COLUMNS = {"input_ids", "seq_start_id", "loss_mask"}
-
-
-def is_packed_parquet_file(path) -> bool:
-    """Check if a path refers to a packed Parquet file or pattern.
-
-    Args:
-        path: A Path object or string path.
-
-    Returns:
-        True if the path ends with .idx.parquet or .idx.pq, or contains a glob
-        pattern that would match such files.
-    """
-    name = str(path).lower()
-    # Matches both direct files and glob patterns (e.g., "data*.idx.parquet")
-    # since both end with the extension.
-    return name.endswith(".idx.parquet") or name.endswith(".idx.pq")
-
-
-def is_packed_parquet_spec(spec: str | Path) -> bool:
-    """Check if a spec refers to a packed Parquet source (file, directory, or glob).
-
-    This predicate reflects what the dataset loader supports in packed mode:
-    - Single .parquet/.idx.parquet/.idx.pq files
-    - Glob patterns ending in .parquet/.idx.parquet/.idx.pq
-    - Directories containing parquet files
-
-    Args:
-        spec: A path specification (file, directory, or glob pattern).
-
-    Returns:
-        True if the spec could refer to packed Parquet data.
-    """
-    spec_str = str(spec).lower()
-
-    # Check for parquet file extensions (including glob patterns)
-    if spec_str.endswith(".parquet") or spec_str.endswith(".pq"):
-        return True
-
-    # Check for glob patterns containing parquet extension
-    if "*" in spec_str or "?" in spec_str:
-        # Extract the pattern part after the last glob character
-        return ".parquet" in spec_str or ".pq" in spec_str
-
-    # For directories, try to resolve to parquet files
-    # This is more robust than is_dir() on distributed filesystems (Lustre, S3, etc.)
-    try:
-        resolved = _resolve_parquet_paths(str(spec))
-        return len(resolved) > 0
-    except ValueError:
-        pass
-
-    # Fallback: check if it's a directory using filesystem abstraction
-    if MultiStorageClientFeature.is_enabled():
-        msc = MultiStorageClientFeature.import_package()
-        msc_path = msc.Path(str(spec))
-        return msc_path.is_dir() if hasattr(msc_path, "is_dir") else False
-    else:
-        return Path(spec).is_dir()
-
-
-def _lazy_import_pyarrow():
-    """Lazily import pyarrow and raise a clear error if not installed."""
-    try:
-        import pyarrow
-        import pyarrow.parquet as pq
-
-        return pyarrow, pq
-    except ImportError as e:
-        raise ImportError(
-            "pyarrow is required for packed Parquet datasets but is not installed. "
-            "Please reinstall megatron-bridge or run: pip install pyarrow>=14.0.0"
-        ) from e
-
-
-def _is_parquet_file(path: str) -> bool:
-    """Check if a path refers to any Parquet file.
-
-    Args:
-        path: A string path.
-
-    Returns:
-        True if the path ends with .parquet or .pq (case-insensitive).
-    """
-    name = path.lower()
-    return name.endswith(".parquet") or name.endswith(".pq")
-
-
-def _resolve_parquet_paths(file_path: str) -> list[str]:
-    """Resolve a file path specification to a list of actual file paths.
-
-    Supports:
-    - Single file: "data.idx.parquet", "shard_0.parquet"
-    - Glob pattern: "data*.idx.parquet", "shard_*.parquet"
-    - Directory: "/path/to/data/" (globs for *.parquet and *.pq)
-
-    Args:
-        file_path: Path specification (file, glob pattern, or directory).
-
-    Returns:
-        Sorted list of resolved file paths.
-
-    Raises:
-        ValueError: If no matching files are found.
-    """
-    path_str = str(file_path)
-
-    # Check if it's a glob pattern
-    if "*" in path_str or "?" in path_str:
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            # MSC glob support - normalize to strings immediately
-            if hasattr(msc, "glob"):
-                paths = [str(p) for p in msc.glob(path_str)]
-            else:
-                # Fallback: try to use msc.Path with glob
-                # Use msc.Path to split parent/pattern to handle URIs correctly
-                msc_full_path = msc.Path(path_str)
-                parent = str(msc_full_path.parent) if hasattr(msc_full_path, "parent") else None
-                pattern = msc_full_path.name if hasattr(msc_full_path, "name") else None
-
-                if parent is not None and pattern is not None:
-                    msc_parent_path = msc.Path(parent)
-                    if hasattr(msc_parent_path, "glob"):
-                        paths = [str(p) for p in msc_parent_path.glob(pattern)]
-                    else:
-                        raise ValueError(f"MSC backend does not support glob operations for pattern: {path_str}")
-                else:
-                    raise ValueError(f"MSC backend does not support glob operations for pattern: {path_str}")
-        else:
-            paths = glob.glob(path_str)
-
-        # Filter to only parquet files (accepts both *.parquet and *.idx.parquet)
-        paths = [p for p in paths if _is_parquet_file(p)]
-        paths = sorted(paths)
-
-        if not paths:
-            raise ValueError(
-                f"No Parquet files found matching pattern: {path_str}. Files must end with .parquet or .pq"
-            )
-        return paths
-
-    # Check if it's a directory
-    if MultiStorageClientFeature.is_enabled():
-        msc = MultiStorageClientFeature.import_package()
-        msc_path = msc.Path(path_str)
-        is_dir = msc_path.is_dir() if hasattr(msc_path, "is_dir") else False
-    else:
-        is_dir = Path(path_str).is_dir()
-
-    if is_dir:
-        # Glob for parquet files in directory (accepts both *.parquet and *.idx.parquet)
-        paths = []
-        for ext in ["*.parquet", "*.pq"]:
-            pattern = os.path.join(path_str, ext)
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                if hasattr(msc, "glob"):
-                    # Normalize to strings immediately
-                    paths.extend([str(p) for p in msc.glob(pattern)])
-                elif hasattr(msc.Path(path_str), "glob"):
-                    paths.extend([str(p) for p in msc.Path(path_str).glob(ext)])
-            else:
-                paths.extend(glob.glob(pattern))
-
-        paths = sorted(set(paths))
-
-        if not paths:
-            raise ValueError(f"No Parquet files found in directory: {path_str}. Files must end with .parquet or .pq")
-        return paths
-
-    # Single file - verify it has a parquet extension and exists
-    if not _is_parquet_file(path_str):
-        return []
-
-    if MultiStorageClientFeature.is_enabled():
-        msc = MultiStorageClientFeature.import_package()
-        exists = msc.Path(path_str).exists()
-    else:
-        exists = Path(path_str).exists()
-
-    if not exists:
-        raise ValueError(f"Packed Parquet file not found: {path_str}")
-
-    return [path_str]
-
-
-def resolve_packed_parquet_paths(spec: str | Path) -> list[str]:
-    """Resolve a packed parquet spec to a list of shard file paths.
-
-    Public wrapper around the internal _resolve_parquet_paths function.
-    Use this to validate and resolve packed parquet specs before dataset creation.
-
-    Supports:
-    - Single file: "data.idx.parquet", "shard_0.parquet"
-    - Glob pattern: "data*.idx.parquet", "shard_*.parquet"
-    - Directory: "/path/to/data/" (globs for *.parquet and *.pq)
-
-    Args:
-        spec: Path specification (file, glob pattern, or directory).
-
-    Returns:
-        Sorted list of resolved file paths.
-
-    Raises:
-        ValueError: If no matching files are found.
-    """
-    return _resolve_parquet_paths(str(spec))
-
-
-def write_packed_parquet(
-    rows: list[dict],
-    output_path: str | Path,
-    row_group_size: int = 500,
-) -> None:
-    """Write packed sequence data to a Parquet file.
-
-    Args:
-        rows: List of dicts with keys 'input_ids', 'loss_mask', 'seq_start_id'.
-              This is the output format of fill_packing_strategy().
-        output_path: Path to write the Parquet file.
-        row_group_size: Number of rows per row group (default 500).
-    """
-    pa, pq = _lazy_import_pyarrow()
-
-    table = pa.table(
-        {
-            "input_ids": [row["input_ids"] for row in rows],
-            "loss_mask": [row["loss_mask"] for row in rows],
-            "seq_start_id": [row["seq_start_id"] for row in rows],
-        }
-    )
-
-    if MultiStorageClientFeature.is_enabled():
-        msc = MultiStorageClientFeature.import_package()
-        buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, row_group_size=row_group_size)
-        with msc.open(str(output_path), "wb") as f:
-            f.write(buf.getvalue().to_pybytes())
-    else:
-        pq.write_table(table, str(output_path), row_group_size=row_group_size)
-
-
-class GPTSFTPackedParquetDataset(GPTSFTPackedDataset):
-    """Dataset for packed sequences stored in Parquet format.
-
-    This class reads packed training data from Parquet files with the naming convention
-    *.idx.parquet or *.idx.pq. It inherits from GPTSFTPackedDataset to reuse the
-    collate_fn() and loss-mask semantics.
-
-    Supports multiple files via:
-    - Single file: "data.idx.parquet"
-    - Glob pattern: "data*.idx.parquet" or "shard_*.idx.pq"
-    - Directory: "/path/to/data/" (globs for *.idx.parquet and *.idx.pq)
-
-    The Parquet file(s) must contain the following columns:
-        - input_ids: list<int32> - Token IDs for the packed sequence
-        - seq_start_id: list<int32> - Start offsets for each sub-sequence within the pack
-        - loss_mask: list<int8> - Per-token loss mask (0 or 1), same length as input_ids
-
-    Example:
-        >>> # Single file
-        >>> dataset = GPTSFTPackedParquetDataset(
-        ...     file_path="packed_data.idx.parquet",
-        ...     tokenizer=tokenizer,
-        ... )
-        >>> # Multiple files via glob
-        >>> dataset = GPTSFTPackedParquetDataset(
-        ...     file_path="data/shard_*.idx.parquet",
-        ...     tokenizer=tokenizer,
-        ... )
-    """
-
-    def __init__(
-        self,
-        file_path: str,
-        tokenizer: "MegatronTokenizer",
-        return_cu_seqlen: bool = True,
-        pad_cu_seqlens: bool = False,
-        pack_metadata_file_path: str | None = None,
-        **kwargs,
-    ):
-        """Initialize the packed Parquet dataset.
-
-        Args:
-            file_path: Path to packed Parquet file(s). Supports:
-                - Single file: "data.idx.parquet"
-                - Glob pattern: "data*.idx.parquet"
-                - Directory: "/path/to/data/"
-            tokenizer: The tokenizer to use.
-            return_cu_seqlen: Whether to return cu_seqlen for THD attention kernel.
-            pad_cu_seqlens: Whether to pad cu_seqlens for cudagraphs compatibility.
-            pack_metadata_file_path: Path to the metadata JSON file for pad_cu_seqlens.
-            **kwargs: Additional arguments passed to parent class.
-        """
-        # Initialize Parquet-specific state before calling parent __init__
-        # (parent calls _load_dataset which needs these)
-        self._file_path_spec: str = file_path  # Original specification (may be glob)
-        self._parquet_paths: list[str] = []  # Resolved list of files
-        self._num_rows: int = 0  # Total rows across all files
-        self._file_offsets: list[int] = []  # Cumulative row counts: [0, rows_file0, rows_file0+rows_file1, ...]
-        self._file_row_group_offsets: list[list[int]] = []  # Row group offsets per file
-
-        # Lazy reader state (opened in worker processes after fork)
-        # Maps file_idx -> (ParquetFile, handle)
-        self._parquet_files: dict[int, tuple] = {}
-        self._cached_file_idx: int | None = None
-        self._cached_row_group_id: int | None = None
-        self._cached_row_group_table = None
-
-        # Call parent __init__ which will call _load_dataset() and _build_samples_mapping()
-        super().__init__(
-            file_path=file_path,
-            tokenizer=tokenizer,
-            return_cu_seqlen=return_cu_seqlen,
-            pad_cu_seqlens=pad_cu_seqlens,
-            pack_metadata_file_path=pack_metadata_file_path,
-            **kwargs,
-        )
-
-    def _load_dataset(self):
-        """Load Parquet metadata from all files and validate schemas.
-
-        This method:
-        1. Resolves the file path specification to actual files
-        2. Reads metadata from each file (not actual data)
-        3. Validates schemas contain required columns
-        4. Builds cumulative indices for efficient row lookups
-
-        The actual Parquet files are opened lazily in _ensure_reader() to survive
-        DataLoader worker forking.
-        """
-        pyarrow, pq = _lazy_import_pyarrow()
-
-        # Resolve file paths
-        self._parquet_paths = _resolve_parquet_paths(self._file_path_spec)
-
-        logger.info(f"Resolved {len(self._parquet_paths)} packed Parquet file(s) from: {self._file_path_spec}")
-
-        # Build cumulative offsets
-        self._file_offsets = [0]
-        self._file_row_group_offsets = []
-
-        for file_idx, parquet_path in enumerate(self._parquet_paths):
-            # Read metadata only (not actual data)
-            if MultiStorageClientFeature.is_enabled():
-                msc = MultiStorageClientFeature.import_package()
-                handle = msc.open(str(parquet_path), "rb")
-                try:
-                    if hasattr(handle, "seekable") and handle.seekable():
-                        metadata = pq.read_metadata(handle)
-                        handle.seek(0)
-                        schema = pq.read_schema(handle)
-                    else:
-                        content = handle.read()
-                        buffer = pyarrow.BufferReader(content)
-                        pf = pq.ParquetFile(buffer)
-                        metadata = pf.metadata
-                        schema = pf.schema_arrow
-                finally:
-                    handle.close()
-            else:
-                metadata = pq.read_metadata(parquet_path)
-                schema = pq.read_schema(parquet_path)
-
-            # Validate schema on every file to catch malformed shards early
-            schema_columns = set(schema.names)
-            missing_columns = REQUIRED_COLUMNS - schema_columns
-            if missing_columns:
-                raise ValueError(
-                    f"Packed Parquet file '{parquet_path}' is missing required columns: {missing_columns}. "
-                    f"Required columns are: {REQUIRED_COLUMNS}. "
-                    f"Found columns: {schema_columns}"
-                )
-
-            # Build row group offsets for this file
-            row_group_offsets = [0]
-            for i in range(metadata.num_row_groups):
-                row_group_offsets.append(row_group_offsets[-1] + metadata.row_group(i).num_rows)
-            self._file_row_group_offsets.append(row_group_offsets)
-
-            # Update cumulative file offset
-            file_rows = metadata.num_rows
-            self._file_offsets.append(self._file_offsets[-1] + file_rows)
-
-            logger.debug(
-                f"  File {file_idx}: {parquet_path}, {file_rows} rows in {metadata.num_row_groups} row groups"
-            )
-
-        self._num_rows = self._file_offsets[-1]
-
-        # Validate dataset is not empty
-        if self._num_rows == 0:
-            raise ValueError(f"Packed Parquet dataset is empty (0 rows) for path: {self._file_path_spec}")
-
-        logger.info(
-            f"Loaded packed Parquet dataset: {self._num_rows} total rows across {len(self._parquet_paths)} file(s)"
-        )
-
-    @staticmethod
-    def validate_row(idx: int, input_ids: list, loss_mask: list, seq_start_id: list) -> None:
-        """Validate packed row invariants.
-
-        This is NOT called in the training hot path for performance reasons.
-        Use it during data preparation or for debugging.
-
-        Args:
-            idx: Row index (for error messages).
-            input_ids: Token IDs for the packed sequence.
-            loss_mask: Per-token loss mask.
-            seq_start_id: Start offsets for each sub-sequence.
-
-        Raises:
-            ValueError: If any invariant is violated.
-        """
-        if len(loss_mask) != len(input_ids):
-            raise ValueError(f"Row {idx}: loss_mask length ({len(loss_mask)}) != input_ids length ({len(input_ids)})")
-
-        if not seq_start_id or seq_start_id[0] != 0:
-            raise ValueError(
-                f"Row {idx}: seq_start_id must start with 0, got {seq_start_id[:5] if seq_start_id else []}"
-            )
-
-        for i, start in enumerate(seq_start_id):
-            if start >= len(input_ids):
-                raise ValueError(f"Row {idx}: seq_start_id[{i}]={start} >= len(input_ids)={len(input_ids)}")
-            if i > 0 and start < seq_start_id[i - 1]:
-                raise ValueError(
-                    f"Row {idx}: seq_start_id is not non-decreasing at index {i}: {seq_start_id[i - 1]} > {start}"
-                )
-
-    def _ensure_reader(self, file_idx: int):
-        """Lazily open a Parquet file for reading.
-
-        Args:
-            file_idx: Index of the file in self._parquet_paths.
-
-        This method is called before accessing data and creates the ParquetFile
-        reader if it doesn't exist. This lazy initialization ensures the reader
-        survives DataLoader worker forking (each worker creates its own readers).
-        """
-        if file_idx in self._parquet_files:
-            return self._parquet_files[file_idx][0]
-
-        pyarrow, pq = _lazy_import_pyarrow()
-        parquet_path = self._parquet_paths[file_idx]
-
-        if MultiStorageClientFeature.is_enabled():
-            msc = MultiStorageClientFeature.import_package()
-            handle = msc.open(str(parquet_path), "rb")
-
-            if hasattr(handle, "seekable") and handle.seekable():
-                pf = pq.ParquetFile(handle)
-                self._parquet_files[file_idx] = (pf, handle)
-            else:
-                # MVP fallback: load entire file into memory for non-seekable streams
-                logger.warning(f"MSC stream is not seekable, loading entire Parquet file into memory: {parquet_path}")
-                content = handle.read()
-                handle.close()
-                buffer = pyarrow.BufferReader(content)
-                pf = pq.ParquetFile(buffer)
-                self._parquet_files[file_idx] = (pf, None)
-        else:
-            pf = pq.ParquetFile(parquet_path)
-            self._parquet_files[file_idx] = (pf, None)
-
-        return self._parquet_files[file_idx][0]
-
-    def close(self) -> None:
-        """Close all open Parquet file handles.
-
-        This method should be called when the dataset is no longer needed to
-        release file handles, especially when using MSC backends. It is also
-        called automatically by __del__.
-        """
-        parquet_files = getattr(self, "_parquet_files", None)
-        if parquet_files is None:
-            return
-
-        for file_idx, (pf, handle) in list(parquet_files.items()):
-            if handle is not None:
-                try:
-                    handle.close()
-                except Exception:
-                    pass  # Best effort cleanup
-            # Also close ParquetFile if it has a close method
-            if hasattr(pf, "close"):
-                try:
-                    pf.close()
-                except Exception:
-                    pass
-
-        self._parquet_files.clear()
-        self._cached_row_group_table = None
-        self._cached_file_idx = None
-        self._cached_row_group_id = None
-
-    def __del__(self) -> None:
-        """Cleanup on deletion."""
-        self.close()
-
-    def _build_samples_mapping(self):
-        """Build epoch-level sample mapping for shuffling.
-
-        Mirrors GPTSFTPackedDataset._build_samples_mapping() exactly,
-        using self._num_rows instead of len(self.indexed_dataset).
-        """
-        if self.max_num_samples is not None:
-            dataset_len = self._num_rows
-            max_num_epochs = np.ceil(self.max_num_samples / dataset_len)
-            indices = np.arange(dataset_len)[None, :].repeat(max_num_epochs, axis=0)
-            [np.random.shuffle(x) for x in indices]
-            self.samples_mapping = indices.reshape(1, -1).squeeze()[: self.max_num_samples]
-        else:
-            self.samples_mapping = None
-
-    def __len__(self):
-        """Return the number of samples in the dataset."""
-        if self.samples_mapping is not None:
-            return len(self.samples_mapping)
-        return self._num_rows
-
-    def _locate_row(self, global_idx: int) -> tuple[int, int, int]:
-        """Map a global row index to (file_idx, row_group_id, row_in_group).
-
-        Args:
-            global_idx: Global row index across all files.
-
-        Returns:
-            Tuple of (file_idx, row_group_id, row_in_group).
-        """
-        # Find which file contains this row
-        file_idx = bisect.bisect_right(self._file_offsets, global_idx) - 1
-        row_in_file = global_idx - self._file_offsets[file_idx]
-
-        # Find which row group within the file
-        row_group_offsets = self._file_row_group_offsets[file_idx]
-        row_group_id = bisect.bisect_right(row_group_offsets, row_in_file) - 1
-        row_in_group = row_in_file - row_group_offsets[row_group_id]
-
-        return file_idx, row_group_id, row_in_group
-
-    def __getitem__(self, idx: int) -> dict:
-        """Get a packed sample by index.
-
-        Args:
-            idx: Sample index. If samples_mapping exists, this is mapped to the
-                actual row index. Negative indices return samples with zeroed loss_mask.
-
-        Returns:
-            dict with keys:
-                - input_ids: list[int] - Token IDs
-                - seq_boundaries: list[int] - Sequence boundaries (derived from seq_start_id)
-                - loss_mask: list[int] - Per-token loss mask
-        """
-        # Apply sample mapping if exists
-        if self.samples_mapping is not None:
-            idx = self.samples_mapping[idx]
-
-        # Handle negative indices (padding samples)
-        # Use wrap-around semantics matching parent GPTSFTPackedDataset behavior
-        is_padding_sample = idx < 0
-        if is_padding_sample:
-            idx = self._num_rows + idx  # -1 -> last row, -N -> Nth from end
-
-        # Locate the row across files and row groups
-        file_idx, row_group_id, row_in_group = self._locate_row(idx)
-
-        # Ensure reader is initialized for this file
-        pf = self._ensure_reader(file_idx)
-
-        # Read row group with caching
-        cache_key = (file_idx, row_group_id)
-        if (self._cached_file_idx, self._cached_row_group_id) != cache_key:
-            self._cached_row_group_table = pf.read_row_group(
-                row_group_id, columns=["input_ids", "seq_start_id", "loss_mask"]
-            )
-            self._cached_file_idx = file_idx
-            self._cached_row_group_id = row_group_id
-
-        # Extract row values
-        table = self._cached_row_group_table
-        input_ids = table.column("input_ids")[row_in_group].as_py()
-        seq_start_id = table.column("seq_start_id")[row_in_group].as_py()
-        loss_mask = table.column("loss_mask")[row_in_group].as_py()
-
-        # Compute derived field
-        seq_boundaries = seq_start_id + [len(input_ids)]
-
-        # For padding samples, zero out the loss mask
-        if is_padding_sample:
-            loss_mask = [0] * len(loss_mask)
-
-        return {
-            "input_ids": input_ids,
-            "seq_boundaries": seq_boundaries,
-            "loss_mask": loss_mask,
-        }
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/finetune.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from megatron.bridge.training.callbacks import Callback, CallbackManager
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.forward_step_func_types import ForwardStepCallable
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.utils.decorators import experimental_fn
-
-
-@experimental_fn
-def finetune(
-    config: ConfigContainer,
-    forward_step_func: ForwardStepCallable,
-    callbacks: list[Callback] | CallbackManager | None = None,
-) -> None:
-    """Main function to run the finetuning.
-
-    Args:
-        config: The main configuration container holding all necessary parameters.
-        forward_step_func: A callable (function or functor) that performs a single
-                          forward and backward step, returning the loss and any computed
-                          metrics. Supports the following signatures:
-                          - 2 args: (data_iterator, model)
-                          - 3 args: (data_iterator, model, return_schedule_plan=False)
-                                   OR (state: GlobalState, data_iterator, model)
-                          - 4 args: (state: GlobalState, data_iterator, model, return_schedule_plan=False)
-        callbacks: Optional list of Callback instances, a CallbackManager, or None.
-
-    Note:
-        Use the signature with GlobalState type hint for full access to configuration, timers, and training state.
-        State injection is automatic based on type hints or parameter names.
-        Functors (classes with __call__) are fully supported.
-
-    Warnings:
-        This is an experimental API and is subject to change in backwards
-        incompatible ways without notice.
-    """
-    assert config.checkpoint.pretrained_checkpoint is not None or config.checkpoint.load is not None, (
-        "Finetuning requires a loading from a pretrained checkpoint or resuming from a checkpoint"
-    )
-    return pretrain(config, forward_step_func, callbacks=callbacks)
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/gpt_step.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from functools import partial
-from typing import Iterable
-
-import modelopt.torch.distill as mtd
-import torch
-from megatron.core import parallel_state
-from megatron.core.models.gpt import GPTModel
-from megatron.core.pipeline_parallel.utils import is_pp_first_stage, is_pp_last_stage
-from megatron.core.utils import (
-    get_batch_on_this_cp_rank,
-    get_model_config,
-    is_te_min_version,
-    unwrap_model,
-)
-
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.losses import masked_next_token_loss
-from megatron.bridge.training.post_training.distillation import loss_func_kd
-from megatron.bridge.training.state import GlobalState
-from megatron.bridge.training.utils.packed_seq_utils import get_packed_seq_params
-from megatron.bridge.training.utils.pg_utils import get_pg_collection
-
-
-logger = logging.getLogger(__name__)
-
-
-def _partition_packed_batch_for_cp(batch: dict[str, torch.Tensor], cp_size: int) -> dict[str, torch.Tensor]:
-    """Partition THD/packed batches across context-parallel ranks.
-
-    Uses transformer_engine's `thd_get_partitioned_indices` to slice sequence
-    dimension aligned with packed cu_seqlens. This avoids the generic
-    `get_batch_on_this_cp_rank` slicing which assumes contiguous sequence tokens.
-    """
-
-    err_msg = "Please update Transformer Engine to >= 1.10 to use Context Parallel with THD format data"
-    try:
-        import transformer_engine_torch as tex
-
-        if not is_te_min_version("1.10.0"):
-            logger.error(err_msg)
-            raise RuntimeError(err_msg)
-    except ModuleNotFoundError as e:
-        logger.error(err_msg)
-        raise e
-
-    cp_rank = parallel_state.get_context_parallel_rank()
-    cu_seqlens = batch["cu_seqlens"]
-    if cu_seqlens.dim() > 1 and cu_seqlens.size(0) != 1:
-        raise ValueError("Packed THD batches expect micro-batch size 1 for context-parallel slicing (THD layout)")
-    cu_seqlens = cu_seqlens.squeeze()
-    cu_seqlens_unpadded = batch.get("cu_seqlens_unpadded")
-    if cu_seqlens_unpadded is not None:
-        batch["cu_seqlens_unpadded"] = cu_seqlens_unpadded.squeeze()
-
-    skip_keys = {
-        "cu_seqlens",
-        "cu_seqlens_unpadded",
-        "cu_seqlens_argmin",
-        "cu_seqlens_unpadded_argmin",
-        "max_seqlen",
-        "token_count",
-    }
-
-    for key, val in batch.items():
-        if val is None or key in skip_keys:
-            continue
-        index = tex.thd_get_partitioned_indices(cu_seqlens, val.size(1), cp_size, cp_rank)
-        batch[key] = val.index_select(1, index)
-
-    return batch
-
-
-def get_batch_from_iterator(
-    data_iterator: Iterable,
-    use_mtp: bool = False,
-    skip_getting_attention_mask_from_dataset: bool = True,
-    *,
-    is_first_pp_stage: bool,
-    is_last_pp_stage: bool,
-) -> dict[str, torch.Tensor]:
-    """Get a batch of data from the iterator.
-
-    Args:
-        data_iterator: The data iterator to get the batch from.
-        use_mtp: Whether Multi-Token Prediction layers are enabled.
-        skip_getting_attention_mask_from_dataset: If set, the dataset will pass a None attention mask.
-
-    Returns:
-        dict[str, torch.Tensor]: A dictionary containing the batch data.
-    """
-    batch = next(data_iterator)
-
-    required_device_keys = set()
-    required_host_keys = set()
-
-    if not skip_getting_attention_mask_from_dataset:
-        required_device_keys.add("attention_mask")
-
-    if "cu_seqlens" in batch:
-        required_device_keys.add("cu_seqlens")
-        if "cu_seqlens_unpadded" in batch:
-            required_device_keys.add("cu_seqlens_unpadded")
-        required_host_keys.add("cu_seqlens_argmin")
-        required_host_keys.add("max_seqlen")
-        if "cu_seqlens_unpadded_argmin" in batch:
-            required_host_keys.add("cu_seqlens_unpadded_argmin")
-
-    if is_first_pp_stage or use_mtp:
-        required_device_keys.update(("tokens", "position_ids"))
-    if is_last_pp_stage:
-        required_device_keys.update(("labels", "loss_mask"))
-
-    _batch_required_keys = {}
-    for key, val in batch.items():
-        if key in required_device_keys:
-            _batch_required_keys[key] = val.cuda(non_blocking=True) if val is not None else None
-        elif key in required_host_keys:
-            _batch_required_keys[key] = val.cpu() if val is not None else None
-        else:
-            _batch_required_keys[key] = None
-
-    return _batch_required_keys
-
-
-def get_batch(
-    data_iterator: Iterable, cfg: ConfigContainer, use_mtp: bool = False, *, pg_collection
-) -> tuple[
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor,
-    torch.Tensor | None,
-    torch.Tensor | None,
-]:
-    """Generate a batch.
-
-    Args:
-        data_iterator: Input data iterator
-        cfg: Configuration container
-        use_mtp: Whether Multi-Token Prediction layers are enabled
-
-    Returns:
-        tuple of tensors containing tokens, labels, loss_mask, attention_mask, position_ids,
-        cu_seqlens, cu_seqlens_argmin, max_seqlen, cu_seqlens_unpadded, and
-        cu_seqlens_unpadded_argmin
-    """
-    # Determine pipeline stage role via process group collection
-    is_first = is_pp_first_stage(pg_collection.pp)
-    is_last = is_pp_last_stage(pg_collection.pp)
-    if (not is_first) and (not is_last):
-        return None, None, None, None, None, None, None, None, None, None
-
-    batch = get_batch_from_iterator(
-        data_iterator,
-        use_mtp,
-        getattr(cfg.dataset, "skip_getting_attention_mask_from_dataset", True),
-        is_first_pp_stage=is_first,
-        is_last_pp_stage=is_last,
-    )
-
-    cp_size = pg_collection.cp.size()
-    has_packed = batch.get("cu_seqlens") is not None
-    if has_packed and cp_size > 1:
-        batch = _partition_packed_batch_for_cp(batch, cp_size)
-    else:
-        # slice batch along sequence dimension for context parallelism
-        batch = get_batch_on_this_cp_rank(batch, cp_group=pg_collection.cp)
-
-    return (
-        batch["tokens"],
-        batch["labels"],
-        batch["loss_mask"],
-        batch.get(
-            "attention_mask"
-        ),  # Attention_mask is optional for pre-training as a casual mask is generated automatically.
-        batch["position_ids"],
-        batch.get("cu_seqlens"),
-        batch.get("cu_seqlens_argmin"),
-        batch.get("max_seqlen"),
-        batch.get("cu_seqlens_unpadded"),
-        batch.get("cu_seqlens_unpadded_argmin"),
-    )
-
-
-def _forward_step_common(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """Forward training step.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and loss mask
-    """
-    timers = state.timers
-    straggler_timer = state.straggler_timer
-
-    config = get_model_config(model)
-    pg_collection = get_pg_collection(model)
-    use_mtp = (getattr(config, "mtp_num_layers", None) or 0) > 0
-
-    timers("batch-generator", log_level=2).start()
-    with straggler_timer(bdata=True):
-        (
-            tokens,
-            labels,
-            loss_mask,
-            attention_mask,
-            position_ids,
-            cu_seqlens,
-            cu_seqlens_argmin,
-            max_seqlen,
-            cu_seqlens_unpadded,
-            cu_seqlens_unpadded_argmin,
-        ) = get_batch(data_iterator, state.cfg, use_mtp, pg_collection=pg_collection)
-    timers("batch-generator").stop()
-
-    forward_args = {
-        "input_ids": tokens,
-        "position_ids": position_ids,
-        "attention_mask": attention_mask,
-        "labels": labels,
-    }
-
-    # Add packed sequence support
-    if cu_seqlens is not None:
-        packed_seq_params = {
-            "cu_seqlens": cu_seqlens,
-            "cu_seqlens_argmin": cu_seqlens_argmin,
-            "max_seqlen": max_seqlen,
-            "cu_seqlens_unpadded": cu_seqlens_unpadded,
-            "cu_seqlens_unpadded_argmin": cu_seqlens_unpadded_argmin,
-        }
-        forward_args["packed_seq_params"] = get_packed_seq_params(packed_seq_params)
-
-    with straggler_timer:
-        if return_schedule_plan:
-            assert config.overlap_moe_expert_parallel_comm, (
-                "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan"
-            )
-            schedule_plan = model.build_schedule_plan(
-                tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask
-            )
-            return schedule_plan, loss_mask
-        else:
-            output_tensor = model(**forward_args)
-
-    return output_tensor, loss_mask
-
-
-def forward_step(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, partial]:
-    """Forward training step.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and the loss function
-    """
-    output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan)
-
-    loss_function = _create_loss_function(
-        loss_mask,
-        check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss,
-        check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss,
-    )
-
-    return output, loss_function
-
-
-def _create_loss_function(loss_mask: torch.Tensor, check_for_nan_in_loss: bool, check_for_spiky_loss: bool) -> partial:
-    """Create a partial loss function with the specified configuration.
-
-    Args:
-        loss_mask: Used to mask out some portions of the loss
-        check_for_nan_in_loss: Whether to check for NaN values in the loss
-        check_for_spiky_loss: Whether to check for spiky loss values
-
-    Returns:
-        A partial function that can be called with output_tensor to compute the loss
-    """
-    return partial(
-        masked_next_token_loss,
-        loss_mask,
-        check_for_nan_in_loss=check_for_nan_in_loss,
-        check_for_spiky_loss=check_for_spiky_loss,
-    )
-
-
-def forward_step_modelopt(
-    state: GlobalState, data_iterator: Iterable, model: GPTModel, return_schedule_plan: bool = False
-) -> tuple[torch.Tensor, partial]:
-    """Forward training step with ModelOpt required modifications.
-
-    Args:
-        state: Global state for the run
-        data_iterator: Input data iterator
-        model: The GPT Model
-        return_schedule_plan (bool): Whether to return the schedule plan instead of the output tensor
-
-    Returns:
-        tuple containing the output tensor and the loss function
-    """
-    output, loss_mask = _forward_step_common(state, data_iterator, model, return_schedule_plan)
-
-    loss_function = _create_loss_function_modelopt(
-        loss_mask,
-        model,
-        check_for_nan_in_loss=state.cfg.rerun_state_machine.check_for_nan_in_loss,
-        check_for_spiky_loss=state.cfg.rerun_state_machine.check_for_spiky_loss,
-    )
-
-    return output, loss_function
-
-
-def _create_loss_function_modelopt(
-    loss_mask: torch.Tensor, model: GPTModel, check_for_nan_in_loss: bool, check_for_spiky_loss: bool
-) -> partial:
-    """Create a partial loss function with the specified configuration.
-
-    Kept here for backward compatibility with tests and callers that patch
-    `megatron.bridge.training.gpt_step.masked_next_token_loss`.
-
-    Args:
-        loss_mask: Used to mask out some portions of the loss
-        model: The GPT Model
-        check_for_nan_in_loss: Whether to check for NaN values in the loss
-        check_for_spiky_loss: Whether to check for spiky loss values
-
-    Returns:
-        A partial function that can be called with output_tensor to compute the loss
-    """
-    mnt_loss_func = partial(
-        masked_next_token_loss,
-        loss_mask,
-        check_for_nan_in_loss=check_for_nan_in_loss,
-        check_for_spiky_loss=check_for_spiky_loss,
-    )
-    unwrapped_model = unwrap_model(model)
-    if isinstance(unwrapped_model, mtd.DistillationModel):
-        return partial(loss_func_kd, loss_mask=loss_mask, original_loss_fn=mnt_loss_func, model=unwrapped_model)
-    else:
-        return mnt_loss_func
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/src/megatron/bridge/training/config.py
-```py
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import warnings
-from abc import ABC, abstractmethod
-from dataclasses import MISSING, dataclass, field, fields
-from pathlib import Path
-from typing import Any, Dict, Literal, Optional, Tuple, Union
-
-import torch
-from megatron.core.datasets.gpt_dataset import GPTDatasetConfig as MCoreGPTDatasetConfig
-from megatron.core.distributed import DistributedDataParallelConfig as MCoreDistributedDataParallelConfig
-from megatron.core.optimizer import OptimizerConfig as MCoreOptimizerConfig
-from megatron.core.optimizer import (
-    ParamGroupOverride,
-    ParamKey,
-)
-from megatron.core.process_groups_config import ProcessGroupCollection
-from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
-from megatron.core.transformer.module import MegatronModule
-from megatron.core.transformer.transformer_config import MLATransformerConfig as MCoreMLATransformerConfig
-from megatron.core.transformer.transformer_config import TransformerConfig as MCoreTransformerConfig
-from megatron.training.config import CheckpointConfig as MTrainCheckpointConfig
-from megatron.training.config import DistributedInitConfig as MTrainDistributedInitConfig
-from megatron.training.config import LoggerConfig as MTrainLoggerConfig
-from megatron.training.config import ProfilingConfig as MTrainProfilingConfig
-from megatron.training.config import RerunStateMachineConfig as MTrainRerunStateMachineConfig
-from megatron.training.config import RNGConfig, ValidationConfig
-from megatron.training.config import SchedulerConfig as MTrainSchedulerConfig
-from megatron.training.config import StragglerDetectionConfig as MTrainStragglerDetectionConfig
-from megatron.training.config import TrainingConfig as MTrainTrainingConfig
-
-from megatron.bridge.data.datasets.packed_sequence import PackedSequenceSpecs
-from megatron.bridge.models import GPTModelProvider, T5ModelProvider
-from megatron.bridge.models.gpt.gpt_builder import GPTModelConfig
-from megatron.bridge.models.mamba.mamba_builder import MambaModelConfig
-from megatron.bridge.models.mamba.mamba_provider import MambaModelProvider
-from megatron.bridge.models.mimo.mimo_provider import MimoModelProvider
-from megatron.bridge.peft.base import PEFT
-from megatron.bridge.training.comm_overlap import CommOverlapConfig
-from megatron.bridge.training.flex_dispatcher_backend import validate_flex_dispatcher_backend
-from megatron.bridge.training.mixed_precision import MixedPrecisionConfig, get_mixed_precision_config
-from megatron.bridge.training.tokenizers.config import TokenizerConfig
-from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
-from megatron.bridge.training.utils.config_utils import _ConfigContainerBase as Container
-from megatron.bridge.utils.common_utils import (
-    get_world_size_safe,
-    print_rank_0,
-    warn_rank_0,
-)
-
-
-@dataclass
-class DistributedDataParallelConfig(MCoreDistributedDataParallelConfig):
-    """Megatron Core DistributedDataParallelConfig with deferred post-init.
-
-    This class inherits from Megatron Core's DistributedDataParallelConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    param_name_patterns_for_fp32_local_accumulation: Tuple[str, ...] = ()
-    """fnmatch patterns selecting parameters whose gradients should be locally
-    accumulated in FP32. The special pattern ``'all'`` matches every parameter.
-    Synced from MCore c586f6d56 (#4028); field will be inherited from the base
-    class after the next mcore bump."""
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic.
-
-        This method calls the original Megatron Core DistributedDataParallelConfig.__post_init__()
-        to compute derived fields based on the current field values.
-        """
-        super().__post_init__()
-
-
-@dataclass
-class OptimizerConfig(MCoreOptimizerConfig):
-    """Megatron Core OptimizerConfig with deferred post-init.
-
-    This class inherits from Megatron Core's OptimizerConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic.
-
-        This method calls the original Megatron Core OptimizerConfig.__post_init__()
-        to compute derived fields based on the current field values.
-        """
-        super().__post_init__()
-
-
-@dataclass(kw_only=True)
-class DistributedInitConfig(MTrainDistributedInitConfig):
-    """Configuration settings for distributed training initialization."""
-
-    external_gpu_device_mapping: bool = False
-    """If True, indicates that GPU device mapping has been externally managed
-    (e.g., via CUDA_VISIBLE_DEVICES environment variable). When True, uses device 0
-    instead of local rank for CUDA device selection. This is useful when launching
-    with external process managers that handle GPU visibility.
-    """
-
-    enable_megatron_core_experimental: bool = False
-    """Enable experimental features for Megatron Core."""
-
-    use_decentralized_pg: bool = False
-    """Use ProcessGroupCollection passed through functions instead of relying on mcore's
-    global parallel state (mpu) variables. When True, parallel groups are obtained from
-    the pg_collection object rather than the global megatron.core.parallel_state module."""
-
-    @property
-    def lazy_init(self) -> bool:
-        return self.lazy_mpu_init
-
-    @lazy_init.setter
-    def lazy_init(self, value: bool) -> None:
-        self.lazy_mpu_init = value
-
-
-@dataclass(kw_only=True)
-class RerunStateMachineConfig(MTrainRerunStateMachineConfig):
-    """Configuration for the rerun state machine used for result validation or stats."""
-
-    rerun_mode: Literal["disabled", "validate_results", "report_determinism_stats"] = "disabled"
-    """Use re-run engine to validate results (default) or to emit stats
-    on variability of computations due to non-deterministic algorithms."""
-
-    spiky_loss_factor: float = 10.0
-    """Factor for detecting spiky loss. A loss is considered spiky if it exceeds
-    this multiple of the max observed loss over the sample window."""
-
-
-@dataclass(kw_only=True)
-class DataloaderConfig:
-    """Base configuration for data loading."""
-
-    dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = None
-    """Dataloader type: 'single' for single pass, 'cyclic' for multiple passes with shuffling,
-    'batch' for global batch sampling (used in fine-tuning), or 'external' for custom dataloaders."""
-
-    num_workers: int = 2
-    """Dataloader number of workers."""
-
-    data_sharding: bool = True
-    """Disable data sharding."""
-
-    pin_memory: bool = True
-    """Whether to pin memory during data loading for faster GPU training."""
-
-    drop_last: bool = True
-    """Whether to drop the last incomplete batch."""
-
-    persistent_workers: bool = True
-    """Whether to keep data loading workers persistent across epochs.
-    Automatically set to False when num_workers is 0."""
-
-    trust_remote_code: Optional[bool] = None
-    """Whether remote code execution should be trusted for a given HF path."""
-
-    def finalize(self):
-        """Finalize dataloader config field constraints."""
-        if self.num_workers == 0 and self.persistent_workers:
-            self.persistent_workers = False
-
-
-@dataclass(frozen=True)
-class DatasetBuildContext:
-    """Interface that encapsulates framework internals.
-
-    This context provides metadata needed to build datasets
-    while hiding implementation details of the framework.
-
-    Attributes:
-        train_samples: Number of samples for training dataset
-        valid_samples: Number of samples for validation dataset
-        test_samples: Number of samples for test dataset
-        tokenizer: Optional tokenizer instance for text processing
-        pg_collection: Optional process group collection for distributed training
-    """
-
-    train_samples: int
-    valid_samples: int
-    test_samples: int
-    tokenizer: Optional[MegatronTokenizer] = None
-    pg_collection: Optional[ProcessGroupCollection] = None
-
-
-@dataclass(frozen=True)
-class OptimizerConfigOverrideProviderContext:
-    """Context for providing config overrides."""
-
-    scheduler_config: "SchedulerConfig"
-    optimizer_config: OptimizerConfig
-    model: Union[MegatronModule, list[MegatronModule]]
-
-
-@dataclass
-class OptimizerConfigOverrideProvider:
-    """Abstract base class for providing config overrides."""
-
-    def build_config_overrides(
-        self, context: OptimizerConfigOverrideProviderContext
-    ) -> dict[ParamKey, ParamGroupOverride] | None:
-        """Build config overrides for weight decay based on scheduler configuration.
-
-        This function creates parameter-specific overrides for weight decay behavior.
-        By default, weight decay is skipped for bias parameters and 1D parameters.
-        For Qwen3-Next models, weight decay is applied to q_layernorm and k_layernorm.
-
-        Args:
-            context: OptimizerConfigOverrideProviderContext which packages the scheduler
-                configuration, optimizer configuration, and model.
-
-        Returns:
-            Dictionary of ParamKey to ParamGroupOverride for the optimizer
-        """
-        model = context.model
-        scheduler_config = context.scheduler_config
-        optimizer_config = context.optimizer_config
-
-        config_overrides: dict[ParamKey, ParamGroupOverride] = {}
-
-        # Collect param names that should skip weight decay
-        # NOTE: this can be simplified once https://github.com/NVIDIA/Megatron-LM/pull/2753
-        #  is merged into dev. Then we can re-use megatron's apply_wd_to_qk_layernorm option
-        #  and call megatron.core.optimizer.get_standard_config_overrides(optimizer_config)
-        #  directly for standard settings, replacing the custom logic below for qwen3-next.
-        no_wd_names: list[str] = []
-        is_qwen3_next = scheduler_config.no_weight_decay_cond_type == "qwen3_next"
-
-        model_list = model if isinstance(model, list) else [model]
-        for model_chunk in model_list:
-            for name, param in model_chunk.named_parameters():
-                # Skip weight decay for bias parameters
-                if name.endswith(".bias"):
-                    no_wd_names.append(name)
-                    continue
-
-                # Skip weight decay for 1D parameters
-                if len(param.shape) == 1:
-                    if is_qwen3_next:
-                        # Qwen3-Next: apply weight decay to qk layernorm (don't add to skip list)
-                        if "q_layernorm" in name or "k_layernorm" in name:
-                            continue
-                    no_wd_names.append(name)
-
-        # Create a single ParamKey with all names that should skip weight decay
-        if no_wd_names:
-            no_wd_key = ParamKey(name=tuple(no_wd_names))
-            config_overrides[no_wd_key] = ParamGroupOverride(wd_mult=0.0)
-
-        # Now handle decoupled LR:
-        if optimizer_config.decoupled_lr is not None:
-            decoupled_lr_config: ParamGroupOverride = {"max_lr": optimizer_config.decoupled_lr}
-            decoupled_param_key = ParamKey(attr="is_embedding_or_output_parameter")
-            if optimizer_config.decoupled_min_lr is not None:
-                decoupled_lr_config["min_lr"] = optimizer_config.decoupled_min_lr
-            config_overrides[decoupled_param_key] = decoupled_lr_config
-
-        return config_overrides if config_overrides else None
-
-
-@dataclass
-class DatasetProvider(DataloaderConfig, ABC):
-    """Abstract base class for custom dataset configurations.
-
-    Provides an interface for users to implement their own dataset builders
-    while automatically inheriting all DataloaderConfig functionality.
-
-    Users must:
-    1. Inherit from this class
-    2. Implement the build_datasets() method
-
-    Example:
-        @dataclass
-        class S3DatasetConfig(DatasetProvider):
-            bucket_name: str
-            data_prefix: str
-            seq_length: int
-
-            def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]:
-                # Custom implementation to load data from S3
-                train_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/train", context.tokenizer)
-                valid_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/valid", context.tokenizer)
-                test_ds = load_s3_dataset(self.bucket_name, f"{self.data_prefix}/test", context.tokenizer)
-                return train_ds, valid_ds, test_ds
-    """
-
-    @abstractmethod
-    def build_datasets(self, context: DatasetBuildContext) -> Tuple[Optional[Any], Optional[Any], Optional[Any]]:
-        """Build train, validation, and test datasets.
-
-        This method is called by the framework during dataset initialization.
-        Implementations should use the provided context to create appropriate
-        datasets for each split.
-
-        Args:
-            context: Build context with sample counts and tokenizer
-
-        Returns:
-            Tuple of (train_dataset, valid_dataset, test_dataset)
-            Any element can be None if that split shouldn't be created.
-
-        Raises:
-            NotImplementedError: Must be implemented by subclasses
-        """
-        pass
-
-
-@dataclass
-class GPTDatasetConfig(MCoreGPTDatasetConfig, DataloaderConfig):
-    """Megatron Core GPTDatasetConfig with deferred post-init.
-
-    This class inherits from MCore's GPTDatasetConfig and DataloaderConfig but defers the
-    execution of post_init() until finalize() is explicitly called. This allows
-    for field modifications after construction but before computed fields are calculated.
-    """
-
-    data_path: str | list[str] | None = None
-    """CLI-friendly alternative to ``blend``.  Accepts a single path string,
-    a space-separated multi-path string, or a list of paths (with optional
-    interleaved weights, matching Megatron-LM ``--data-path`` semantics).
-    Converted to ``blend`` automatically during ``finalize()``."""
-
-    def __init__(
-        self,
-        seq_length: int | None = None,
-        skip_getting_attention_mask_from_dataset: bool = True,
-        data_path: str | list[str] | None = None,
-        *args,
-        **kwargs,
-    ):
-        """
-        Args:
-            seq_length (int | None): the sequence length. If not provided, `sequence_length` must be in kwargs.
-            skip_getting_attention_mask_from_dataset (bool): if set, the dataset will pass a None attention mask
-                and the attention mask is autogenerated from the attn backend.
-            data_path: CLI-friendly data path(s). Converted to ``blend`` in ``finalize()``.
-        """
-        self.skip_getting_attention_mask_from_dataset = skip_getting_attention_mask_from_dataset
-        self.data_path = data_path
-
-        if seq_length is not None:
-            kwargs["sequence_length"] = seq_length
-        elif "sequence_length" not in kwargs:
-            raise ValueError("Either `seq_length` or `sequence_length` must be provided.")
-
-        dataloader_kwargs = {k: kwargs.pop(k) for k in list(kwargs) if k in DataloaderConfig.__dataclass_fields__}
-        MCoreGPTDatasetConfig.__init__(self, *args, **kwargs)
-        DataloaderConfig.__init__(self, **dataloader_kwargs)
-
-    def __post_init__(self) -> None:
-        """Skip MCore post_init during initial construction.
-
-        The original post_init logic is deferred until finalize() is called.
-        """
-        pass
-
-    @property
-    def seq_length(self):
-        """Alias for MCore's `sequence_length` field."""
-        return getattr(self, "sequence_length", None)
-
-    @seq_length.setter
-    def seq_length(self, value):
-        setattr(self, "sequence_length", value)
-
-    def finalize(self) -> None:
-        """Execute the deferred MCore post-init logic and Bridge-specific checks.
-
-        This method calls the original Megatron Core GPTDatasetConfig.__post_init__()
-        and then performs Bridge-specific validation.
-        """
-        if self.blend is None and self.data_path is not None:
-            from megatron.core.datasets.utils import get_blend_from_list
-
-            if isinstance(self.data_path, str):
-                paths = self.data_path.split()
-            else:
-                paths = list(self.data_path)
-            self.blend = get_blend_from_list(paths)
-
-        # Call MCore's post_init
-        super(MCoreGPTDatasetConfig, self).__post_init__()
-
-        assert self.reset_position_ids is not None, "reset_position_ids must be defined."
-        assert self.reset_attention_mask is not None, "reset_attention_mask must be defined."
-        assert self.eod_mask_loss is not None, "eod_mask_loss must be defined."
-
-        DataloaderConfig.finalize(self)
-
-
-@dataclass
-class GPTFIMDatasetConfig(GPTDatasetConfig):
-    """Configuration object forGPT FIM datasets"""
-
-    def __init__(
-        self,
-        fim_rate: float = None,
-        fim_spm_rate: float = None,
-        fim_extra_tokens: Dict = None,
-        fim_split_sample: Optional[str] = None,
-        fim_fragment_rate: Optional[float] = None,
-        fim_no_prefix: Optional[str] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            fim_rate: float: probability to convert a training sample into a FIM format.
-            fim_spm_rate (float): probability that the a FIM sample uses the SPM format over the PSM format.
-            fim_extra_tokens (Dict): should consist of prefix, middle, suffix, PAD, and EOD tokens.
-            fim_split_sample (str): string around which to split the sample for FIM.
-            fim_fragment_rate (float): rate of FIM on each fragment when split_sample is not None.
-            fim_no_prefix (str): do not apply FIM to fragments that start with this prefix.
-        """
-        self.fim_data = True
-        self.fim_rate = fim_rate
-        self.fim_spm_rate = fim_spm_rate
-        self.fim_extra_tokens = fim_extra_tokens
-        self.fim_split_sample = fim_split_sample
-        self.fim_fragment_rate = fim_fragment_rate
-        self.fim_no_prefix = fim_no_prefix
-
-        super().__init__(**kwargs)
-
-
-@dataclass
-class MockGPTDatasetConfig(GPTDatasetConfig):
-    """Modifies GPTDatasetConfig to enforce necessary options for creating a mock dataset."""
-
-    def __init__(
-        self,
-        seq_length: int,
-        **kwargs,
-    ):
-        super().__init__(seq_length=seq_length, **kwargs)
-
-    def finalize(self):
-        """ """
-        # Raise TypeError if `blend` or `blend_per_split` is not None
-        if self.__dict__.get("blend", None):
-            raise TypeError("got an unexpected keyword argument 'blend'")
-        if self.__dict__.get("blend_per_split", None):
-            raise TypeError("got an unexpected keyword argument 'blend_per_split'")
-        if self.__dict__.get("blend", None) and self.__dict__.get("blend_per_split", None):
-            raise TypeError("got an unexpected keyword argument")
-
-        # Drop `blend` and `blend_per_split` from __dict__
-        self.__dict__.pop("blend", None)
-        self.__dict__.pop("blend_per_split", None)
-
-        return super().finalize()
-
-
-@dataclass(kw_only=True)
-class FinetuningDatasetConfig(DataloaderConfig):
-    """Configuration specific to finetuning datasets, inheriting from DataloaderConfig.
-
-    Note: For fine-tuning, dataloader_type defaults to 'batch' which ensures sequences
-    within each global batch are padded to the same length.
-    """
-
-    dataloader_type: Optional[Literal["single", "cyclic", "batch", "external"]] = "batch"
-    """Dataloader type for fine-tuning. Defaults to 'batch' for optimal padding behavior."""
-
-    dataset_root: Optional[Union[str, Path]] = None
-    seq_length: int
-    seed: int = 1234
-    memmap_workers: int = 1
-    max_train_samples: Optional[int] = None
-    packed_sequence_specs: Optional[PackedSequenceSpecs] = None
-    dataset_kwargs: Optional[dict[str, Any]] = None
-    do_validation: bool = True
-    do_test: bool = True
-
-
-@dataclass(kw_only=True)
-class SchedulerConfig(MTrainSchedulerConfig):
-    """Configuration settings for the learning rate scheduler and weight decay."""
-
-    def finalize(self) -> None:
-        """Post-initialization checks for scheduler config."""
-        if self.start_weight_decay is not None:
-            assert self.start_weight_decay >= 0.0, "start_weight_decay should be positive."
-            assert self.end_weight_decay >= self.start_weight_decay
-
-        if self.override_opt_param_scheduler:
-            assert not self.use_checkpoint_opt_param_scheduler, "both override and use-checkpoint are set."
-
-        # Validate mutual exclusivity between iteration-based and sample-based scheduler fields
-        has_iter_fields = (
-            self.lr_decay_iters is not None or self.lr_warmup_iters != 0 or self.lr_wsd_decay_iters is not None
-        )
-        has_sample_fields = (
-            self.lr_decay_samples is not None or self.lr_warmup_samples != 0 or self.lr_wsd_decay_samples is not None
-        )
-
-        assert not (has_iter_fields and has_sample_fields), (
-            f"Cannot mix iteration-based and sample-based scheduler fields. "
-            f"Found iteration fields: lr_decay_iters={self.lr_decay_iters}, lr_warmup_iters={self.lr_warmup_iters}, lr_wsd_decay_iters={self.lr_wsd_decay_iters}. "
-            f"Found sample fields: lr_decay_samples={self.lr_decay_samples}, lr_warmup_samples={self.lr_warmup_samples}, lr_wsd_decay_samples={self.lr_wsd_decay_samples}. "
-            f"Use either iteration fields OR sample fields, not both."
-        )
-
-        # Validate mutual exclusivity between lr_warmup_fraction and specific warmup fields
-        if self.lr_warmup_fraction is not None:
-            assert self.lr_warmup_iters == 0 and self.lr_warmup_samples == 0, (
-                f"Cannot specify lr_warmup_fraction={self.lr_warmup_fraction} with lr_warmup_iters={self.lr_warmup_iters} or lr_warmup_samples={self.lr_warmup_samples}. "
-                f"Use either lr_warmup_fraction OR lr_warmup_iters OR lr_warmup_samples."
-            )
-
-
-@dataclass(kw_only=True)
-class TrainingConfig(MTrainTrainingConfig):
-    """Configuration settings related to the training loop and validation."""
-
-    check_optimizer_step_success: bool = True
-    """Checks optimizer.step() succeeded at each training step ."""
-
-    skip_sync_grad_norm_across_mp: bool = False
-    """Skips syncing the grad norm across the model parallel group."""
-
-    # ---------------- Validation config. ----------------
-
-    eval_iters: int | None = None
-    """Number of iterations to run for evaluation validation/test for. Deprecated in favor of ValidationConfig."""
-
-    eval_interval: int | None = None
-    """Interval between running evaluation on validation set. Deprecated in favor of ValidationConfig."""
-
-    skip_train: bool | None = None
-    """If set, bypass the training loop, optionally do evaluation for validation/test, and exit. Deprecated in favor of ValidationConfig."""
-
-    def finalize(self) -> None:
-        """Validate training mode specification and calculate train_iters from train_samples if needed."""
-        has_train_iters = self.train_iters is not None
-        has_train_samples = self.train_samples is not None
-
-        assert has_train_iters or has_train_samples, "Either train_iters or train_samples must be provided"
-        assert not (has_train_iters and has_train_samples), "Cannot specify both train_iters and train_samples"
-        if has_train_samples:
-            assert self.train_samples > 0, "train_samples must be positive"
-            assert self.rampup_batch_size is None, "Batch size rampup not supported with sample-based training yet"
-
-            # Calculate train_iters from train_samples (rampup_batch_size already validated as None)
-            self.train_iters = self.train_samples // self.global_batch_size
-            print_rank_0(f"Setting training iterations to {self.train_iters} based on {self.train_samples} samples")
-
-
-@dataclass(kw_only=True)
-class CheckpointConfig(MTrainCheckpointConfig):
-    """Configuration settings for model checkpointing (saving and loading)."""
-
-    pretrained_checkpoint: Optional[str] = None
-    """Directory containing a pretrained model checkpoint for finetuning.
-
-    This can be either:
-      - A parent checkpoint directory (e.g. ``/checkpoints/my_model/``) that
-        contains tracker files (``latest_train_state.pt``) and ``iter_*``
-        subdirectories.
-      - A specific iteration directory (e.g.
-        ``/checkpoints/my_model/iter_0001000/``) that directly contains the
-        checkpoint payload (``run_config.yaml``, weight shards, etc.).
-    """
-
-    storage_writers_per_rank: int = 1
-    """Number of storage writers per rank for torch_dist checkpoint format.
-    Affects the number of checkpoint files: saving_ranks * storage_writers_per_rank."""
-
-    use_persistent_ckpt_worker: bool = True
-    """Use a persistent background worker for async checkpoint saves. When enabled, creates a dedicated
-    worker thread/process for handling async saves. When disabled, uses temporal workers that are
-    created and destroyed for each save operation."""
-
-    async_strategy: str = "nvrx"
-    """Async checkpoint strategy to use. Options: ``"nvrx"`` (default) or ``"mcore"``.
-    The ``"nvrx"`` strategy uses nvidia_resiliency_ext for async checkpointing and falls back
-    to ``"mcore"`` if the package is not installed."""
-
-    async_write_results_mp_mode: str = "fork"
-    """Multiprocessing start method for the async write results queue.
-    Options: ``"fork"`` (default), ``"spawn"``, ``"forkserver"``."""
-
-    strict_fsdp_dtensor_load: bool = False
-    """Whether to enforce strict loading for FSDP DTensor checkpoints. When False, allows partial loading."""
-
-    custom_manager_class: str | None = None
-    """Fully qualified class name for a custom CheckpointManager implementation.
-
-    When set, checkpoint operations will instantiate and delegate to this class instead of the default
-    checkpoint manager. The custom class must implement the `CheckpointManager` protocol
-    defined in `megatron.bridge.training.checkpointing`.
-
-    Example: ``'mypackage.checkpoint.MyCheckpointManager'``
-    """
-
-    def finalize(self) -> None:
-        """Post-initialization checks for checkpoint config."""
-        if self.pretrained_checkpoint is not None:
-            from megatron.bridge.training.utils.checkpoint_utils import file_exists
-
-            assert file_exists(self.pretrained_checkpoint), (
-                f"Pretrained checkpoint {self.pretrained_checkpoint} does not exist"
-            )
-
-        if self.load_main_params_from_ckpt:
-            assert not self.load_optim, "load_main_params_from_ckpt must be used with load_optim=False"
-
-        if self.async_save:
-            assert self.save is not None, "async_save is enabled, but save is not set. Set save to a valid path."
-            assert self.use_persistent_ckpt_worker, "async_save requires use_persistent_ckpt_worker=True."
-
-        # Validate ckpt_step if specified
-        if self.ckpt_step is not None:
-            if self.load is None:
-                raise ValueError(
-                    f"ckpt_step={self.ckpt_step} specified but checkpoint.load is None. "
-                    f"Please set checkpoint.load to the base checkpoint directory."
-                )
-
-        if self.dist_ckpt_optim_fully_reshardable:
-            assert not self.distrib_optim_fully_reshardable_mem_efficient, (
-                "distrib_optim_fully_reshardable_mem_efficient requires use_gloo_process_groups"
-            )
-
-
-@dataclass(kw_only=True)
-class LoggerConfig(MTrainLoggerConfig):
-    """Configuration settings for logging, including TensorBoard and WandB."""
-
-    skip_train_metrics_log: bool = False
-    """Skips logging of training metrics to all logging backends and to the console as well."""
-
-    timing_log_level: Literal[-1, 0, 1, 2] = 0
-    """Granularity level to measure and report timing.
-    -1: To disable timing logging as the timer start from 0 and above.
-    0: report only iteration time and make sure timing does not introduce extra overhead.
-    1: report timing for operations that are executed very limited times (basically once) during each iteration
-        (such as gradient all-reduce)
-    2: report timing for operations that migh be executed numerous times during each iteration.
-    Note that setting the level to 1 or 2 might cause increase in iteration time.
-    """
-
-    mlflow_experiment: Optional[str] = None
-    """The MLFlow experiment name."""
-
-    mlflow_run_name: Optional[str] = None
-    """The MLFlow run name."""
-
-    mlflow_tracking_uri: Optional[str] = None
-    """Optional MLFlow tracking URI."""
-
-    mlflow_tags: Optional[dict[str, str]] = None
-    """Optional tags to apply to the MLFlow run."""
-
-    comet_project: Optional[str] = None
-    """The Comet ML project name. Comet logging is disabled when this is None."""
-
-    comet_experiment_name: Optional[str] = None
-    """The Comet ML experiment name."""
-
-    comet_workspace: Optional[str] = None
-    """The Comet ML workspace. If not set, uses the default workspace for the API key."""
-
-    comet_api_key: Optional[str] = None
-    """The Comet ML API key. Can also be set via COMET_API_KEY environment variable."""
-
-    comet_tags: Optional[list[str]] = None
-    """Optional list of tags to apply to the Comet ML experiment."""
-
-    logging_level: int = logging.INFO
-    """Set default logging level"""
-
-    def finalize(self) -> None:
-        """Validate logger settings and optional MLFlow dependency."""
-        if self.mlflow_experiment and (self.mlflow_run_name is None or self.mlflow_run_name == ""):
-            raise ValueError("Set logger.mlflow_run_name when enabling MLFlow logging.")
-
-        using_mlflow = any(
-            [
-                self.mlflow_experiment,
-                self.mlflow_run_name,
-                self.mlflow_tracking_uri,
-                self.mlflow_tags,
-            ]
-        )
-
-        if using_mlflow:
-            try:
-                import importlib
-
-                importlib.import_module("mlflow")
-            except ModuleNotFoundError as exc:
-                raise ModuleNotFoundError(
-                    "MLFlow logging is configured, but the 'mlflow' package is not installed. "
-                    "Install it via pip install mlflow or uv add mlflow"
-                ) from exc
-
-        if self.comet_project and (self.comet_experiment_name is None or self.comet_experiment_name == ""):
-            raise ValueError("Set logger.comet_experiment_name when enabling Comet ML logging.")
-
-        using_comet = any(
-            [
-                self.comet_project,
-                self.comet_experiment_name,
-                self.comet_workspace,
-                self.comet_api_key,
-                self.comet_tags,
-            ]
-        )
-
-        if using_comet:
-            try:
-                import importlib
-
-                importlib.import_module("comet_ml")
-            except ModuleNotFoundError as exc:
-                raise ModuleNotFoundError(
-                    "Comet ML logging is configured, but the 'comet_ml' package is not installed. "
-                    "Install it via pip install comet-ml or uv add comet-ml"
-                ) from exc
-
-
-@dataclass(kw_only=True)
-class ProfilingConfig(MTrainProfilingConfig):
-    """Configuration settings for profiling the training process."""
-
-    def finalize(self) -> None:
-        """Validate profiling configuration."""
-        assert not (self.use_pytorch_profiler and self.use_nsys_profiler), (
-            "Exactly one of pytorch or nsys profiler should be enabled, not both, when ProfilingConfig is active."
-        )
-        assert self.profile_step_start >= 0, f"profile_step_start must be >= 0, got {self.profile_step_start}"
-        assert self.profile_step_end >= 0, f"profile_step_end must be >= 0, got {self.profile_step_end}"
-        assert self.profile_step_end >= self.profile_step_start, (
-            f"profile_step_end ({self.profile_step_end}) must be >= profile_step_start ({self.profile_step_start})"
-        )
-
-
-@dataclass(kw_only=True)
-class TensorInspectConfig:
-    """Configuration for Nvidia-DL-Framework-Inspect integration."""
-
-    enabled: bool = False
-    """Enable tensor inspection and statistics collection."""
-
-    features: dict[str, Any] | str | Path | None = None
-    """Feature configuration as a Python dict or a YAML file path."""
-
-    feature_dirs: list[str] | None = None
-    """Directories containing feature implementations (searched recursively)."""
-
-    log_dir: str | None = None
-    """Root directory to store inspection logs/statistics. Defaults to checkpoint save dir if unset."""
-
-    init_training_step: int = 0
-    """Initial training step for the inspector (used when resuming)."""
-
-    def finalize(self) -> None:
-        """Populate sensible defaults when inspection is enabled.
-
-        - If feature_dirs is unset, default to the installed TransformerEngine
-          debug features package path (transformer_engine.debug.features), when available.
-        """
-        if not self.enabled:
-            return
-        if not self.feature_dirs:
-            try:
-                import importlib
-
-                te_features_mod = importlib.import_module("transformer_engine.debug.features")
-                te_features_dir = Path(te_features_mod.__file__).parent
-                if te_features_dir.exists():
-                    self.feature_dirs = [str(te_features_dir)]
-            except Exception:
-                pass
-
-
-@dataclass
-class FaultToleranceConfig:
-    """Configuration settings related to fault tolerance mechanisms (NVIDIA internal use)."""
-
-    enable_ft_package: bool = False
-    """If set, Fault Tolerance package is enabled. Note: This feature is for Nvidia internal use only."""
-
-    calc_ft_timeouts: bool = False
-    """If set, FT package will try to automatically compute the timeouts.
-    Note: This feature is for Nvidia internal use only.
-    """
-
-    simulate_fault: bool = False
-    """Sets a simulated fault for fault tolerance. NOTE: This if for fault tolerance testing only."""
-
-    simulated_fault_type: Literal["rank_hung", "rank_killed", "random"] = "random"
-    """How the simulated fault should behave. 'random' will randomly choose one of the other two options."""
-
-    simulated_fault_rank: Optional[int] = None
-    """Rank on which simulated fault should occur."""
-
-    simulated_fault_base_delay: int = 0
-    """Base delay before simulated fault thread is started. A small random delay is added to this."""
-
-
-@dataclass(kw_only=True)
-class StragglerDetectionConfig(MTrainStragglerDetectionConfig):
-    """Configuration settings for detecting and logging GPU stragglers."""
-
-    enable_straggler_on_startup: bool = True
-    """If set, StragglerDetector is enabled on startup."""
-
-
-@dataclass
-class NVRxStragglerDetectionConfig:
-    """Configuration settings for NVIDIA Resiliency Extension straggler detection."""
-
-    enabled: bool = False
-    """Enable NVRx straggler detection."""
-
-    report_time_interval: float = 300.0
-    """Interval [seconds] of the straggler check."""
-
-    calc_relative_gpu_perf: bool = True
-    """Calculate relative GPU performance scores."""
-
-    calc_individual_gpu_perf: bool = True
-    """Calculate individual GPU performance scores."""
-
-    num_gpu_perf_scores_to_print: int = 5
-    """How many best and worst perf scores to print (0 - does not print periodically,
-    but only if stragglers are detected)."""
-
-    gpu_relative_perf_threshold: float = 0.7
-    """Threshold for relative GPU performance scores."""
-
-    gpu_individual_perf_threshold: float = 0.7
-    """Threshold for individual GPU performance scores."""
-
-    stop_if_detected: bool = False
-    """Set to True, to terminate the workload if stragglers are detected."""
-
-    enable_logging: bool = True
-    """Set to True, to log GPU performance scores."""
-
-    profiling_interval: int = 1
-    """Profiling interval passed to straggler.Detector.initialize."""
-
-    logger_name: str = "megatron.bridge.NVRxStragglerDetection"
-    """Logger name for straggler detection messages."""
-
-    def finalize(self) -> None:
-        """Validate NVRx straggler detection configuration."""
-        if self.enabled:
-            if not (self.calc_relative_gpu_perf or self.calc_individual_gpu_perf):
-                raise ValueError(
-                    "At least one of calc_relative_gpu_perf or calc_individual_gpu_perf must be True "
-                    "when NVRx straggler detection is enabled."
-                )
-            if self.report_time_interval <= 0:
-                raise ValueError("report_time_interval must be positive.")
-            if not (0.0 <= self.gpu_relative_perf_threshold <= 1.0):
-                raise ValueError("gpu_relative_perf_threshold must be between 0.0 and 1.0.")
-            if not (0.0 <= self.gpu_individual_perf_threshold <= 1.0):
-                raise ValueError("gpu_individual_perf_threshold must be between 0.0 and 1.0.")
-
-
-@dataclass
-class InProcessRestartConfig:
-    """Configuration settings for NVIDIA Resiliency Extension in-process restart functionality."""
-
-    enabled: bool = False
-    """Enable in-process restart mechanism from nvidia-resiliency-ext."""
-
-    max_iterations: Optional[int] = None
-    """Maximum number of in-process restart iterations."""
-
-    monitor_thread_interval: float = 1.0
-    """Monitoring interval (in seconds) for the monitoring thread."""
-
-    monitor_process_interval: float = 1.0
-    """Monitoring interval (in seconds) for the monitoring process."""
-
-    progress_watchdog_interval: float = 1.0
-    """Interval (in seconds) for automatic progress watchdog timestamp updates."""
-
-    heartbeat_interval: float = 30.0
-    """Monitoring interval (in seconds) for detecting unresponsive ranks."""
-
-    soft_timeout: float = 60.0
-    """Soft progress timeout (in seconds)."""
-
-    hard_timeout: float = 90.0
-    """Hard progress timeout (in seconds)."""
-
-    heartbeat_timeout: float = 60.0
-    """Timeout (in seconds) for a missing rank detection heartbeat."""
-
-    barrier_timeout: float = 120.0
-    """Timeout (in seconds) for internal distributed barrier."""
-
-    completion_timeout: float = 120.0
-    """Timeout (in seconds) for barrier on completion on all ranks."""
-
-    last_call_wait: float = 1.0
-    """Time interval (in seconds) for other ranks to report concurrent terminal failures."""
-
-    termination_grace_time: float = 1.0
-    """Interval (in seconds) between SIGTERM and SIGKILL issued on hard timeout."""
-
-    granularity: Literal["node", "rank"] = "node"
-    """Granularity for in-process restart."""
-
-    active_world_size: Optional[int] = None
-    """The number of ranks initially executing the workload.
-    The remaining ranks from the allocation are set aside as warm reserve.
-    If None, defaults to WORLD_SIZE environment variable."""
-
-    empty_cuda_cache: bool = True
-    """Empty CUDA cache during restart finalization."""
-
-    max_rank_faults: Optional[int] = None
-    """Maximum number of rank faults allowed before terminating the job."""
-
-    monitor_process_logdir: Optional[str] = None
-    """Directory for monitor process log files. If None, monitor process logging is disabled."""
-
-
-# ---------------- Container config (standalone top-level config) ----------------
-@dataclass(kw_only=True)
-class ConfigContainer(Container):
-    """Top-level container holding all configuration objects."""
-
-    rng: RNGConfig = field(default_factory=RNGConfig)
-    rerun_state_machine: RerunStateMachineConfig = field(default_factory=RerunStateMachineConfig)
-    train: TrainingConfig
-    model: (
-        GPTModelProvider | T5ModelProvider | MambaModelProvider | MimoModelProvider | GPTModelConfig | MambaModelConfig
-    )
-    optimizer: OptimizerConfig
-    optimizer_config_override_provider: OptimizerConfigOverrideProvider = field(
-        default_factory=OptimizerConfigOverrideProvider
-    )
-    ddp: DistributedDataParallelConfig = field(default_factory=DistributedDataParallelConfig)
-    validation: ValidationConfig = field(default_factory=ValidationConfig)
-    scheduler: SchedulerConfig
-    dataset: GPTDatasetConfig | FinetuningDatasetConfig | DatasetProvider
-    logger: LoggerConfig
-    tokenizer: TokenizerConfig
-    checkpoint: CheckpointConfig
-    dist: DistributedInitConfig = field(default_factory=DistributedInitConfig)
-    ft: Optional[FaultToleranceConfig] = None
-    straggler: Optional[StragglerDetectionConfig] = None
-    nvrx_straggler: Optional[NVRxStragglerDetectionConfig] = None
-    profiling: ProfilingConfig = field(default_factory=ProfilingConfig)
-    peft: Optional[PEFT] = None
-    comm_overlap: Optional[CommOverlapConfig] = None
-    mixed_precision: Optional[Union[MixedPrecisionConfig, str]] = None
-    tensor_inspect: TensorInspectConfig | None = None
-    inprocess_restart: Optional[InProcessRestartConfig] = None
-
-    def get_data_parallel_size(self, world_size: int) -> int:
-        """Calculate the data parallel size based on the model configuration."""
-        model_cfg = self.model
-        total_model_size = (
-            model_cfg.tensor_model_parallel_size
-            * model_cfg.pipeline_model_parallel_size
-            * model_cfg.context_parallel_size
-        )
-        assert world_size % total_model_size == 0, f"""
-        world size ({world_size}) is not divisible by total_model_size ({model_cfg.tensor_model_parallel_size=} * {model_cfg.pipeline_model_parallel_size=} * {model_cfg.context_parallel_size=})
-        """
-        return world_size // total_model_size
-
-    def set_data_parallel_size(self) -> None:
-        """Calculate and set data_parallel_size for this config and comm_overlap config.
-
-        This method calculates the data parallel size needed by setup methods, without
-        triggering full validation or finalization of Megatron Core configs.
-        """
-        # Calculate data parallel size (needed for comm overlap setup)
-        world_size = get_world_size_safe()
-        self.data_parallel_size = self.get_data_parallel_size(world_size)
-
-        # Set data_parallel_size on comm_overlap config if present
-        if self.comm_overlap is not None:
-            self.comm_overlap.data_parallel_size = self.data_parallel_size
-
-    def _validate_and_apply_deterministic_mode(self) -> None:
-        """Apply and validate deterministic mode requirements.
-
-        This enforces restrictions and settings that must hold when
-        the model is configured to run in deterministic mode.
-        """
-        if not getattr(self.model, "deterministic_mode", False):
-            return
-
-        # Disallow flash attention when running deterministically
-        if getattr(self.model, "attention_backend", None) == AttnBackend.flash:
-            raise AssertionError("Flash attention can not be used in deterministic mode.")
-
-        # Disallow cross-entropy loss fusion as it is not deterministic
-        assert not getattr(self.model, "cross_entropy_loss_fusion", False), (
-            "Cross Entropy Fusion is currently not deterministic."
-        )
-
-        all_reduce_choices = ("Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS")
-        assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, (
-            f"NCCL_ALGO must be one of {all_reduce_choices}."
-        )
-
-        # Enable deterministic algorithms in torch
-        torch.use_deterministic_algorithms(True)
-
-    def validate(self) -> None:
-        """Performs validation checks on the combined configuration.
-
-        Calculates dependent values like data_parallel_size and scheduler steps.
-        Ensures compatibility between different configuration settings.
-        """
-
-        # Propagate in-batch packing flag to model config so TransformerConfig.finalize()
-        # can enable variable_seq_lengths for pipeline parallelism.
-        if getattr(self.dataset, "pack_sequences_in_batch", False):
-            self.model._pack_sequences_in_batch = True
-
-        if hasattr(self.dataset, "finalize"):
-            self.dataset.finalize()
-        if hasattr(self.ddp, "finalize"):
-            self.ddp.finalize()
-        if hasattr(self.optimizer, "finalize"):
-            self.optimizer.finalize()
-        if hasattr(self.model, "finalize"):
-            self.model.finalize()
-
-        self.logger.finalize()
-        self.train.finalize()
-        self.scheduler.finalize()
-        self.checkpoint.finalize()
-        if self.profiling is not None:
-            self.profiling.finalize()
-        if self.nvrx_straggler is not None:
-            self.nvrx_straggler.finalize()
-        if self.tensor_inspect is not None:
-            self.tensor_inspect.finalize()
-
-        # Sync config. If TE RNG tracker is set in either ways, set them in both places.
-        if self.rng.te_rng_tracker or self.model.use_te_rng_tracker:
-            self.model.use_te_rng_tracker = self.rng.te_rng_tracker = True
-
-        # Re-run post-inits of sub-configs
-        for f in fields(self):
-            sub_cfg = getattr(self, f.name)
-            if hasattr(sub_cfg, "__post_init__") and not hasattr(sub_cfg, "finalize"):
-                sub_cfg.__post_init__()
-
-        # Distributed - ensure data_parallel_size is calculated (might already be set by set_data_parallel_size)
-        if not hasattr(self, "data_parallel_size") or self.data_parallel_size is None:
-            world_size = get_world_size_safe()
-            self.data_parallel_size = self.get_data_parallel_size(world_size)
-            # Set data_parallel_size on comm_overlap config if present
-            if self.comm_overlap is not None:
-                self.comm_overlap.data_parallel_size = self.data_parallel_size
-
-        # Deterministic mode validations and settings
-        self._validate_and_apply_deterministic_mode()
-
-        # Run validations
-        _validate_and_sync_distributed_optimizer_settings(self)
-        _validate_mixed_precision_consistency(self)
-        _validate_fine_grained_activation_offloading(self)
-
-        # CUDA graph scope validation: check_for_nan_in_loss must be disabled with full_iteration graph
-        if self.model.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in self.model.cuda_graph_scope:
-            assert not self.rerun_state_machine.check_for_nan_in_loss, (
-                "check_for_nan_in_loss must be disabled when using full_iteration CUDA graph. "
-                "Set rerun_state_machine.check_for_nan_in_loss=False."
-            )
-        if self.model.cuda_graph_impl == "none":
-            self.model.cuda_graph_scope = []
-
-        if self.dist.use_megatron_fsdp and self.dist.use_torch_fsdp2:
-            raise ValueError("Using use_megatron_fsdp and use_torch_fsdp2 at the same time is not supported.")
-
-        # Megatron FSDP Config checks
-        if self.dist.use_megatron_fsdp or self.ddp.use_megatron_fsdp:
-            # Set Megatron FSDP Configs
-            self.dist.use_megatron_fsdp = True
-            self.ddp.use_megatron_fsdp = True
-
-            assert not self.dist.use_tp_pp_dp_mapping, "use_tp_pp_dp_mapping is not supported with Megatron FSDP"
-
-            if self.checkpoint.save is not None or self.checkpoint.load is not None:
-                # only check if saving or loading
-                assert self.checkpoint.ckpt_format == "fsdp_dtensor", (
-                    "Megatron FSDP only supports fsdp_dtensor checkpoint format"
-                )
-
-            if self.ddp.average_in_collective and not self.ddp.disable_symmetric_registration:
-                print_rank_0(
-                    "average_in_collective is not supported with NCCL symmetric registration, setting to False"
-                )
-                self.ddp.average_in_collective = False
-
-            # reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP
-            if self.ddp.reuse_grad_buf_for_mxfp8_param_ag:
-                print_rank_0("reuse_grad_buf_for_mxfp8_param_ag is not supported with Megatron FSDP, setting to False")
-                self.ddp.reuse_grad_buf_for_mxfp8_param_ag = False
-            if self.optimizer.reuse_grad_buf_for_mxfp8_param_ag:
-                self.optimizer.reuse_grad_buf_for_mxfp8_param_ag = False
-
-        # ModelOpt/Quantization checks
-        if getattr(self.model, "restore_modelopt_state", False):
-            assert not self.model.gradient_accumulation_fusion, (
-                "Gradient accumulation fusion is not supported with ModelOpt/Quantized models. "
-                "Please set model.gradient_accumulation_fusion=False"
-            )
-
-        # Checkpoint
-        if self.checkpoint.save is not None or self.checkpoint.load is not None:
-            # only check if saving or loading
-            if self.checkpoint.ckpt_format == "fsdp_dtensor":
-                assert self.ddp.use_megatron_fsdp and not self.dist.use_torch_fsdp2, (
-                    "fsdp_dtensor checkpoint format only supports Megatron FSDP"
-                )
-
-        # Enforce async_save format restriction
-        if self.checkpoint.async_save:
-            assert self.checkpoint.ckpt_format == "torch_dist", (
-                "async_save is only supported with ckpt_format='torch_dist'"
-            )
-
-        # Set defaults for tensor inspect callback
-        if self.tensor_inspect is not None and self.tensor_inspect.enabled:
-            if self.tensor_inspect.log_dir is None:
-                self.tensor_inspect.log_dir = self.checkpoint.save or "."
-            if self.tensor_inspect.init_training_step == 0 and self.checkpoint.ckpt_step is not None:
-                self.tensor_inspect.init_training_step = int(self.checkpoint.ckpt_step)
-
-        self.model.use_cpu_initialization = self.model.use_cpu_initialization or self.dist.lazy_mpu_init
-
-        # Gloo process groups are not supported when using decentralized process groups (NCCL only).
-        if self.dist.use_decentralized_pg:
-            assert not self.dist.use_gloo_process_groups, (
-                "Gloo process groups are not supported when use_decentralized_pg=True. "
-                "Decentralized process groups only support NCCL backend."
-            )
-
-        # Make sure all functionality that requires Gloo process groups is disabled.
-        if not self.dist.use_gloo_process_groups:
-            if self.optimizer.use_distributed_optimizer:
-                # If using distributed optimizer, must use distributed checkpointing.
-                # Legacy checkpointing uses Gloo process groups to collect full distributed
-                # optimizer state in the CPU memory of DP rank 0.
-                assert self.checkpoint.ckpt_format == "torch_dist"
-
-        # Cross-validation between training and scheduler configs
-        self._validate_training_scheduler_compatibility()
-
-        # Calculate scheduler steps for both iteration-based and sample-based training
-        self._calculate_scheduler_steps()
-
-        if self.model.context_parallel_size > 1:
-            assert self.model.seq_length % (self.model.context_parallel_size * 2) == 0, (
-                "Sequence length must be divisible by 2 * context parallel size if context parallel is used."
-            )
-            if isinstance(self.dataset, FinetuningDatasetConfig):
-                # check calculate_per_token_loss to be True
-                # check average_in_collective to be False
-                # for context parallel to solve the issue of nan loss on ranks with all tokens masked
-                # (only happens in SFT)
-                assert self.model.calculate_per_token_loss, (
-                    "When finetuning with CP>1, calculate_per_token_loss must be True"
-                )
-                assert not self.ddp.average_in_collective, (
-                    "When finetuning with CP>1, average_in_collective must be False"
-                )
-
-        self._validate_cp_comm_type()
-
-        if (
-            isinstance(self.dataset, FinetuningDatasetConfig)
-            and self.dataset.packed_sequence_specs is not None
-            and self.dataset.packed_sequence_specs.packed_sequence_size > 0
-            and self.train.micro_batch_size > 1
-        ):
-            packed_sequence_size = self.dataset.packed_sequence_specs.packed_sequence_size
-            raise ValueError(
-                "Micro batch size should be 1 when training with packed sequence, but your micro batch size "
-                f"is {self.train.micro_batch_size}. \nThe following config is equivalent to your current setting for "
-                f"a packed dataset. Please update your config to the following: \n"
-                f"Set micro batch size to 1 (currently {self.train.micro_batch_size})\n"
-                f"Set global batch size to {self.train.global_batch_size // self.train.micro_batch_size} "
-                f"(currently {self.train.global_batch_size}) \n"
-                f"Set packed sequence length to {packed_sequence_size * self.train.micro_batch_size} "
-                f"(currently {packed_sequence_size}) \n"
-                f"For details please visit "
-                f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html"
-            )
-
-        if getattr(self.dataset, "pack_sequences_in_batch", False) and self.train.micro_batch_size == 1:
-            raise ValueError(
-                "micro_batch_size should be greater than 1 when using pack_sequences_in_batch=True. "
-                "In-batch packing concatenates multiple sequences within a microbatch, so at least 2 sequences "
-                "are required per micro-batch."
-            )
-
-        if self.peft is not None:
-            assert self.checkpoint.pretrained_checkpoint is not None, "PEFT requires a pretrained checkpoint path"
-
-        if self.dataset is not None:
-            # Only validate sequence length for GPTDatasetConfig or FinetuningDatasetConfig
-            # DatasetProvider instances may not have sequence_length attributes
-            if isinstance(self.dataset, (GPTDatasetConfig, FinetuningDatasetConfig)):
-                data_seq_length = (
-                    self.dataset.seq_length
-                    if isinstance(self.dataset, FinetuningDatasetConfig)
-                    else self.dataset.seq_length
-                )
-
-                assert self.model.seq_length == data_seq_length, (
-                    f"Please ensure sequence length configuration in model config and "
-                    f"dataset config match.\nSequence length in model config: {self.model.seq_length}, "
-                    f"Sequence length in dataset config: {data_seq_length}"
-                )
-
-        # Validate DeepEP or HybridEP is supported for the current GPU architecture
-        if isinstance(self.model, (GPTModelConfig, MambaModelConfig)):
-            validate_flex_dispatcher_backend(self.model.transformer)
-        else:
-            validate_flex_dispatcher_backend(self.model)
-
-        for f in fields(ValidationConfig):
-            train_val = getattr(self.train, f.name, None)
-            if train_val is not None:
-                warnings.warn(
-                    f"TrainingConfig.{f.name} is deprecated and will be removed in a future release. Use ValidationConfig.{f.name} instead.",
-                    stacklevel=2,
-                )
-                setattr(self.validation, f.name, train_val)
-
-    def _validate_cp_comm_type(self) -> None:
-        """Validate cp_comm_type and hierarchical_context_parallel_sizes consistency."""
-        cp_comm_type = getattr(self.model, "cp_comm_type", None)
-        hcp_sizes = getattr(self.model, "hierarchical_context_parallel_sizes", None)
-        cp_size = getattr(self.model, "context_parallel_size", 1)
-
-        if cp_size > 1 and cp_comm_type is not None:
-            if isinstance(cp_comm_type, list):
-                assert len(cp_comm_type) == self.model.num_layers, (
-                    f"Length of cp_comm_type ({len(cp_comm_type)}) must equal num_layers ({self.model.num_layers})."
-                )
-            else:
-                assert isinstance(cp_comm_type, str), (
-                    f"cp_comm_type must be a str or list of str, got {type(cp_comm_type)}."
-                )
-
-        cp_comm_types = cp_comm_type if isinstance(cp_comm_type, list) else [cp_comm_type or "p2p"]
-        if any("a2a+p2p" in ct for ct in cp_comm_types):
-            assert hcp_sizes is not None, (
-                "hierarchical_context_parallel_sizes must be set when cp_comm_type "
-                "contains 'a2a+p2p'. Without it, CP communication is silently disabled "
-                "and each rank attends only to its local chunk, producing artificially "
-                "high throughput but broken training. Example: for cp=16 across 4 nodes "
-                "of 8 GPUs, set hierarchical_context_parallel_sizes=[8, 2]."
-            )
-
-        if hcp_sizes is not None:
-            from math import prod
-
-            assert prod(hcp_sizes) == cp_size, (
-                f"Product of hierarchical_context_parallel_sizes {hcp_sizes} "
-                f"(={prod(hcp_sizes)}) must equal context_parallel_size (={cp_size})."
-            )
-
-    def _validate_training_scheduler_compatibility(self) -> None:
-        """Cross-validation between training and scheduler configs."""
-        has_train_samples = self.train.train_samples is not None
-
-        if has_train_samples:
-            # Sample-based training validation
-            assert self.scheduler.lr_decay_iters is None, (
-                "Use lr_decay_samples for sample-based training, not lr_decay_iters"
-            )
-            assert self.scheduler.lr_warmup_iters == 0, (
-                "Use lr_warmup_samples for sample-based training, not lr_warmup_iters"
-            )
-            assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_samples != 0), (
-                "Can only specify one of lr_warmup_fraction or lr_warmup_samples"
-            )
-        else:
-            # Iteration-based training validation
-            assert self.scheduler.lr_decay_samples is None, (
-                "Use lr_decay_iters for iteration-based training, not lr_decay_samples"
-            )
-            assert self.scheduler.lr_warmup_samples == 0, (
-                "Use lr_warmup_iters for iteration-based training, not lr_warmup_samples"
-            )
-            assert not (self.scheduler.lr_warmup_fraction is not None and self.scheduler.lr_warmup_iters != 0), (
-                "Can only specify one of lr_warmup_fraction or lr_warmup_iters"
-            )
-
-    def _calculate_scheduler_steps(self) -> None:
-        """Calculate scheduler steps for both iteration-based and sample-based training."""
-        is_sample_based = self.train.train_samples is not None
-
-        if is_sample_based:
-            if self.scheduler.lr_decay_samples is None:
-                self.scheduler.lr_decay_samples = self.train.train_samples
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_samples
-            self.scheduler.wd_incr_steps = self.train.train_samples
-
-            if self.scheduler.lr_wsd_decay_samples is not None:
-                self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_samples
-
-            # Warmup calculation for sample-based training
-            if self.scheduler.lr_warmup_fraction is not None:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps
-            else:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_samples
-        else:
-            # Iteration-based training
-            if self.scheduler.lr_decay_iters is None:
-                self.scheduler.lr_decay_iters = self.train.train_iters
-            if self.scheduler.lr_wsd_decay_iters is None and self.scheduler.lr_decay_style == "WSD":
-                self.scheduler.lr_wsd_decay_iters = self.scheduler.lr_decay_iters
-            self.scheduler.lr_decay_steps = self.scheduler.lr_decay_iters * self.train.global_batch_size
-            self.scheduler.wd_incr_steps = self.train.train_iters * self.train.global_batch_size
-
-            if self.scheduler.lr_wsd_decay_iters is not None:
-                self.scheduler.wsd_decay_steps = self.scheduler.lr_wsd_decay_iters * self.train.global_batch_size
-
-            if self.scheduler.lr_warmup_fraction is not None:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_fraction * self.scheduler.lr_decay_steps
-            else:
-                self.scheduler.lr_warmup_steps = self.scheduler.lr_warmup_iters * self.train.global_batch_size
-
-        # Enforce the Megatron Core invariant: lr_warmup_steps must be < lr_decay_steps.
-        # This can be violated when train_iters is small (e.g. smoke runs) while
-        # lr_warmup_iters is tuned for a full-length training run.
-        if self.scheduler.lr_decay_steps <= 0:
-            raise ValueError(
-                f"lr_decay_steps must be > 0, got {self.scheduler.lr_decay_steps}. "
-                "Please increase train_iters/train_samples or lr_decay_iters/lr_decay_samples."
-            )
-        if self.scheduler.lr_warmup_steps >= self.scheduler.lr_decay_steps:
-            capped = self.scheduler.lr_decay_steps - 1
-            warnings.warn(
-                f"lr_warmup_steps ({self.scheduler.lr_warmup_steps}) >= lr_decay_steps "
-                f"({self.scheduler.lr_decay_steps}); capping lr_warmup_steps to {capped}. "
-                "Reduce lr_warmup_iters (or lr_warmup_samples) for short training runs.",
-                UserWarning,
-                stacklevel=2,
-            )
-            self.scheduler.lr_warmup_steps = capped
-
-    def log_non_default_values(self) -> None:
-        """Log configuration values that differ from Megatron Core defaults.
-
-        For configs that inherit from Megatron Core (e.g., OptimizerConfig, DDPConfig,
-        TransformerConfig), this method logs only the values that differ from the Mcore
-        defaults. This makes it easier to spot unintended deviations from baseline settings.
-
-        For configs that don't inherit from Mcore, key values are logged via
-        `_get_key_config_values`, which excludes None values and callables.
-        """
-        if isinstance(self.model, (GPTModelConfig, MambaModelConfig)):
-            transformer_cfg = self.model.transformer
-        else:
-            transformer_cfg = self.model
-        # Determine the correct Mcore parent class for the model config
-        # Some models (e.g., DeepSeek) use MLATransformerConfig instead of TransformerConfig
-        model_mcore_class = _get_mcore_transformer_parent(transformer_cfg)
-
-        # Map of config names to their (config object, Mcore parent class or None)
-        mcore_configs = [
-            ("optimizer", self.optimizer, MCoreOptimizerConfig),
-            ("ddp", self.ddp, MCoreDistributedDataParallelConfig),
-            ("model", transformer_cfg, model_mcore_class),
-        ]
-
-        # Non-Mcore configs - log all values
-        non_mcore_configs = [
-            ("train", self.train),
-            ("validation", self.validation),
-            ("scheduler", self.scheduler),
-            ("dataset", self.dataset),
-            ("checkpoint", self.checkpoint),
-            ("logger", self.logger),
-            ("tokenizer", self.tokenizer),
-            ("rng", self.rng),
-        ]
-
-        log_lines = [""]
-        log_lines.append("=" * 70)
-        log_lines.append("Configuration Summary (Non-Default Values vs Megatron Core)")
-        log_lines.append("=" * 70)
-
-        # Log non-default values for Mcore configs
-        for config_name, config_obj, mcore_class in mcore_configs:
-            non_defaults = _get_non_default_values(config_obj, mcore_class)
-            if non_defaults:
-                log_lines.append(f"\n[{config_name}] Non-default values (vs Mcore {mcore_class.__name__}):")
-                for field_name, (current_val, default_val) in sorted(non_defaults.items()):
-                    log_lines.append(f"  {field_name}: {current_val!r}  (Mcore default: {default_val!r})")
-
-        # Log key values for non-Mcore configs
-        log_lines.append("\n" + "-" * 70)
-        log_lines.append("Other Configuration Values:")
-        log_lines.append("-" * 70)
-
-        for config_name, config_obj in non_mcore_configs:
-            if config_obj is None:
-                continue
-            key_values = _get_key_config_values(config_obj)
-            if key_values:
-                log_lines.append(f"\n[{config_name}]:")
-                for field_name, value in sorted(key_values.items()):
-                    log_lines.append(f"  {field_name}: {value!r}")
-
-        log_lines.append("\n" + "=" * 70)
-
-        print_rank_0("\n".join(log_lines))
-
-
-def _get_mcore_transformer_parent(model_config: Any) -> type:
-    """Determine the correct Mcore TransformerConfig parent class for a model.
-
-    Some models (e.g., DeepSeek v2/v3) inherit from MLATransformerConfig instead of
-    the base TransformerConfig. This function checks the inheritance chain to find
-    the appropriate Mcore class to use as the baseline for comparison.
-
-    Args:
-        model_config: The model configuration object.
-
-    Returns:
-        The appropriate Mcore TransformerConfig class (MCoreMLATransformerConfig or
-        MCoreTransformerConfig).
-    """
-    # Check if the model inherits from MLATransformerConfig
-    if isinstance(model_config, MCoreMLATransformerConfig):
-        return MCoreMLATransformerConfig
-    return MCoreTransformerConfig
-
-
-def _get_non_default_values(config_obj: Any, mcore_class: type) -> Dict[str, Tuple[Any, Any]]:
-    """Get values that differ from Mcore parent class defaults.
-
-    Args:
-        config_obj: The config object to compare.
-        mcore_class: The Megatron Core parent class to compare against.
-
-    Returns:
-        Dictionary mapping field name to (current_value, default_value) for non-default fields.
-    """
-    non_defaults = {}
-
-    # Get default values from Mcore class
-    mcore_defaults = {}
-    for f in fields(mcore_class):
-        if f.name.startswith("_"):
-            continue
-        if f.default is not MISSING:
-            mcore_defaults[f.name] = f.default
-        elif f.default_factory is not MISSING:
-            mcore_defaults[f.name] = f.default_factory()
-
-    # Compare current values against Mcore defaults
-    for f in fields(config_obj):
-        if f.name.startswith("_"):
-            continue
-        field_name = f.name
-        current_value = getattr(config_obj, field_name, None)
-
-        if field_name in mcore_defaults:
-            default_value = mcore_defaults[field_name]
-            # Skip callable values (like functions) and complex objects
-            if callable(current_value) or callable(default_value):
-                continue
-            # Compare values
-            try:
-                if current_value != default_value:
-                    non_defaults[field_name] = (current_value, default_value)
-            except (TypeError, ValueError):
-                # Some types may not be directly comparable (e.g., torch.dtype)
-                if str(current_value) != str(default_value):
-                    non_defaults[field_name] = (current_value, default_value)
-
-    return non_defaults
-
-
-def _get_key_config_values(config_obj: Any) -> Dict[str, Any]:
-    """Get key configuration values for non-Mcore configs.
-
-    Args:
-        config_obj: The config object to extract values from.
-
-    Returns:
-        Dictionary mapping field name to value for key fields.
-    """
-    values = {}
-    if not hasattr(config_obj, "__dataclass_fields__"):
-        return values
-
-    for f in fields(config_obj):
-        if f.name.startswith("_"):
-            continue
-        value = getattr(config_obj, f.name, None)
-        # Skip None values and complex objects
-        if value is None:
-            continue
-        if callable(value):
-            continue
-        values[f.name] = value
-
-    return values
-
-
-def runtime_config_update(cfg: ConfigContainer) -> None:
-    """Apply runtime configuration updates prior to initialization.
-
-    This function handles all configuration modifications that need to happen
-    after initial config creation but before final validation and model setup.
-
-    Steps:
-    1. Resolve mixed precision configuration from string if needed
-    2. Apply mixed precision settings to model, optimizer, and DDP configs
-    3. Calculate data parallel size (needed for comm overlap)
-    4. Apply communication overlap configuration
-    5. Validate configuration after all modifications
-
-    Args:
-        cfg: Configuration container to update
-    """
-    # Apply mixed precision configuration if provided
-    if cfg.mixed_precision is not None:
-        if isinstance(cfg.mixed_precision, str):
-            cfg.mixed_precision = get_mixed_precision_config(cfg.mixed_precision)
-        cfg.mixed_precision.finalize()
-        cfg.mixed_precision.setup(cfg.model, cfg.optimizer, cfg.ddp)
-
-    # Calculate data parallel size (needed for comm overlap methods)
-    cfg.set_data_parallel_size()
-
-    # Apply communication overlap configuration if provided
-    if cfg.comm_overlap is not None:
-        cfg.comm_overlap.finalize()
-        cfg.comm_overlap.setup(cfg.model, cfg.optimizer, cfg.ddp)
-
-    # Validate configuration after all modifications
-    cfg.validate()
-
-
-def mimo_runtime_config_update(cfg: ConfigContainer) -> None:
-    """MIMO-equivalent of ``runtime_config_update``.
-
-    The standard ``runtime_config_update`` cannot be used directly because it
-    accesses ``cfg.model`` attributes (``bf16``, ``tensor_model_parallel_size``,
-    ``cuda_graph_impl``, …) that do not exist on ``MimoModelProvider``.
-
-    This function cherry-picks the safe, model-agnostic parts:
-
-    Keeps (safe for MIMO):
-    - ``data_parallel_size = 1`` (MIMO-specific hard-code)
-    - Sub-config finalization (optimizer, ddp, logger, train, scheduler, checkpoint)
-    - Distributed optimizer sync validation
-    - Deterministic mode validation
-
-    Skips (would crash or is N/A):
-    - Mixed precision resolution (per-module, not container-level)
-    - Communication overlap setup (not supported for MIMO)
-    - Model-level validations (FSDP, CUDA graphs, TE RNG tracker sync, etc.)
-
-    See ``playground/runtime_config_update_analysis.md`` for the full analysis.
-    """
-    # MIMO: data_parallel_size is always 1 from the training loop's perspective.
-    cfg.data_parallel_size = 1
-
-    # Finalize sub-configs that don't depend on model construction order.
-    # NOTE: cfg.model.finalize() is NOT called here — it validates parallelism
-    # config and is called inside setup_mimo() right before build_infra().
-    if hasattr(cfg.optimizer, "finalize"):
-        cfg.optimizer.finalize()
-    if hasattr(cfg.ddp, "finalize"):
-        cfg.ddp.finalize()
-    cfg.logger.finalize()
-    cfg.train.finalize()
-    cfg.scheduler.finalize()
-    cfg.checkpoint.finalize()
-
-    # Safe validations
-    _validate_and_sync_distributed_optimizer_settings(cfg)
-    cfg._validate_and_apply_deterministic_mode()
-
-
-def _validate_and_sync_distributed_optimizer_settings(config: ConfigContainer) -> None:
-    """Validate and synchronize distributed optimizer settings between DDP and optimizer configs.
-
-    This function ensures that distributed optimizer settings are consistent across
-    DDP and optimizer configurations. If either setting is enabled, both will be
-    enabled to maintain consistency.
-
-    Args:
-        config: The configuration container to validate and potentially modify.
-    """
-    ddp_setting = config.ddp.use_distributed_optimizer
-    optimizer_setting = config.optimizer.use_distributed_optimizer
-
-    if ddp_setting or optimizer_setting:
-        if ddp_setting != optimizer_setting:
-            warn_rank_0(
-                f"Distributed optimizer settings were not in sync: "
-                f"ddp.use_distributed_optimizer={ddp_setting}, "
-                f"optimizer.use_distributed_optimizer={optimizer_setting}. "
-                f"Automatically enabling distributed optimizer for both settings."
-            )
-        config.ddp.use_distributed_optimizer = True
-        config.optimizer.use_distributed_optimizer = True
-
-
-def _validate_mixed_precision_consistency(config: ConfigContainer) -> None:
-    """Validate that mixed precision settings are consistent between model and optimizer configs.
-
-    Args:
-        config: The configuration container to validate.
-
-    Raises:
-        AssertionError: If precision settings are inconsistent in a way that would
-            indicate ambiguous behavior.
-    """
-    model_cfg = config.model
-    optimizer_cfg = config.optimizer
-
-    # Mutually exclusive: cannot have both bf16 and fp16 enabled
-    assert not (model_cfg.bf16 and model_cfg.fp16), (
-        "Model config cannot have both bf16=True and fp16=True. Please set only one precision mode."
-    )
-    assert not (optimizer_cfg.bf16 and optimizer_cfg.fp16), (
-        "Optimizer config cannot have both bf16=True and fp16=True. Please set only one precision mode."
-    )
-
-    # Validate across model and optimizer configs
-    if optimizer_cfg.use_precision_aware_optimizer:
-        # For bf16 training: optimizer.bf16 must match model.bf16
-        if model_cfg.bf16:
-            assert optimizer_cfg.bf16, (
-                "optimizer.bf16=True must be set when model.bf16=True and use_precision_aware_optimizer=True."
-            )
-        # For fp16 training: optimizer.fp16 must match model.fp16
-        if model_cfg.fp16:
-            assert optimizer_cfg.fp16, (
-                "optimizer.fp16=True must be set when model.fp16=True and use_precision_aware_optimizer=True."
-            )
-        # For fp32 training (neither bf16 nor fp16 on model)
-        if not model_cfg.bf16 and not model_cfg.fp16:
-            assert not optimizer_cfg.bf16 and not optimizer_cfg.fp16, (
-                "optimizer.bf16 and optimizer.fp16 must both be False when "
-                "model is using fp32 precision (model.bf16=False, model.fp16=False) and "
-                "use_precision_aware_optimizer=True."
-            )
-
-
-def _validate_fine_grained_activation_offloading(config: ConfigContainer) -> None:
-    """Validate fine-grained activation offloading configuration.
-
-    This function ensures that fine-grained activation offloading is only enabled
-    with compatible configurations (transformer_engine implementation) and that
-    necessary environment variables are set for newer TE versions.
-
-    Args:
-        config: The configuration container to validate.
-
-    Raises:
-        ValueError: If fine-grained activation offloading is enabled with incompatible settings.
-    """
-    from megatron.core.utils import is_te_min_version
-
-    model_cfg = config.model
-
-    if not model_cfg.fine_grained_activation_offloading:
-        return
-
-    # Fine-grained activation offloading requires transformer_engine implementation
-    if model_cfg.transformer_impl != "transformer_engine":
-        raise ValueError(
-            "Fine-grained activation offloading is only supported with transformer_engine implementation. "
-            f"Current transformer_impl: {model_cfg.transformer_impl}"
-        )
-
-    # For TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 must be set to avoid offloading weights
-    if is_te_min_version("2.10.0"):
-        if os.getenv("NVTE_CPU_OFFLOAD_V1", "0") != "1":
-            raise ValueError(
-                "For fine-grained activation offloading with TE >= 2.10.0, "
-                "NVTE_CPU_OFFLOAD_V1 environment variable should be set to 1 to avoid offloading weights."
-            )
-
-```
-
-File: /Users/mromeijn/src/Megatron-Bridge/scripts/training/run_recipe.py
-```py
-#!/usr/bin/env python3
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Generic Training Script for LLM and diffusion models
-
-This script works with any model family that uses GPT-style training
-(Llama, Gemma, Qwen, GPT, etc.) and with diffusion models (e.g. FLUX, WAN). It dynamically loads recipes and supports
-CLI overrides. The --dataset flag selects the dataset type and automatically
-infers pretrain vs finetune mode.
-
-Usage:
-    Pretrain (mock data):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain-mock
-
-    Pretrain (real data):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain \\
-            'dataset.blend=[[/data/my_dataset_text_document],null]'
-
-    Finetune (SQuAD, default):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune
-
-    Finetune (GSM8K):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune \\
-            dataset.dataset_name=gsm8k
-
-    Finetune (user-supplied JSONL):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_sft_config \\
-            --dataset llm-finetune-preloaded \\
-            dataset.dataset_root=/data/my_finetune_data
-
-    Diffusion pretrain:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \
-            --recipe wan_1_3B_pretrain_config \
-            --step_func wan_step \
-            dataset.path=/data/energon
-
-    Diffusion SFT (full finetuning):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \
-            --recipe wan_1_3B_sft_config \
-            --step_func wan_step
-            dataset.path=/data/energon
-
-    VLM with HF dataset:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_config \\
-            --dataset vlm-hf \\
-            --step_func qwen3_vl_step \\
-            dataset.maker_name=cord_v2 \\
-            dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    VLM with Energon dataset:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_energon_config \\
-            --dataset vlm-energon \\
-            --step_func qwen3_vl_step \\
-            dataset.path=/data/energon \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    VLM with preloaded JSON:
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe qwen3_vl_8b_peft_config \\
-            --dataset vlm-preloaded \\
-            --step_func qwen3_vl_step \\
-            dataset.train_data_path=/data/vlm_train.json \\
-            dataset.image_folder=/data/vlm_images \\
-            dataset.hf_processor_path=Qwen/Qwen3-VL-8B-Instruct \\
-            checkpoint.pretrained_checkpoint=/path/to/checkpoint
-
-    With CLI overrides (Hydra-style, works for any config field):
-        uv run torchrun --nproc_per_node=8 run_recipe.py \\
-            --recipe llama32_1b_pretrain_config \\
-            --dataset llm-pretrain-mock \\
-            train.train_iters=5000 \\
-            optimizer.lr=0.0003
-
-Recipe Arguments:
-    Generic scripts call recipes with no arguments: recipe().
-
-    If you need to pass arguments to the recipe constructor
-    (e.g., custom parallelism at build time), create a custom script.
-"""
-
-import argparse
-import inspect
-from typing import Callable
-
-import megatron.bridge.recipes as recipes
-
-# Diffusion forward steps: use class instances so they can be passed as forward_step_func
-from megatron.bridge.diffusion.models.flux.flux_step import FluxForwardStep
-from megatron.bridge.diffusion.models.wan.wan_step import WanForwardStep
-from megatron.bridge.models.qwen_vl.qwen3_vl_step import forward_step as qwen3_vl_forward_step
-from megatron.bridge.recipes.utils.dataset_utils import (
-    DATASET_TYPES,
-    apply_dataset_override,
-    infer_mode_from_dataset,
-)
-from megatron.bridge.training.audio_lm_step import forward_step as audio_lm_forward_step
-from megatron.bridge.training.config import ConfigContainer
-from megatron.bridge.training.finetune import finetune
-from megatron.bridge.training.gpt_step import forward_step as gpt_forward_step
-from megatron.bridge.training.llava_step import forward_step as llava_forward_step
-from megatron.bridge.training.pretrain import pretrain
-from megatron.bridge.training.utils.omegaconf_utils import process_config_with_overrides
-from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step
-
-
-STEP_FUNCTIONS: dict[str, Callable] = {
-    "audio_lm_step": audio_lm_forward_step,
-    "gpt_step": gpt_forward_step,
-    "vlm_step": vlm_forward_step,
-    "qwen3_vl_step": qwen3_vl_forward_step,
-    "llava_step": llava_forward_step,
-    "flux_step": FluxForwardStep,
-    "wan_step": WanForwardStep,
-}
-
-TRAIN_FUNCTIONS = {
-    "pretrain": pretrain,
-    "finetune": finetune,
-}
-
-ERR_UNKNOWN_STEP = "Unknown step type: {step_type}. Choose from: {choices}"
-ERR_INFER_MODE_FAILED = (
-    "Unable to infer training mode. "
-    "Pass --dataset to specify the dataset type, or include 'pretrain' or 'finetune' "
-    "(or 'sft'/'peft') in the recipe name."
-)
-
-
-def parse_args() -> tuple[argparse.Namespace, list[str]]:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Generic training script for LLM and diffusion models",
-        formatter_class=argparse.RawTextHelpFormatter,
-    )
-    parser.add_argument(
-        "--recipe",
-        type=str,
-        required=True,
-        help="Recipe function name (e.g., llama32_1b_pretrain_config, gemma3_1b_sft_config, gemma3_1b_peft_config)",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=None,
-        choices=DATASET_TYPES,
-        help=(
-            "Dataset type. Training mode (pretrain/finetune) is inferred from this.\n"
-            "LLM datasets:\n"
-            "  llm-pretrain           GPT pretrain data (set dataset.blend=<path>)\n"
-            "  llm-pretrain-mock      Mock pretrain data for testing\n"
-            "  llm-finetune           HF finetune dataset (set dataset.dataset_name=squad|gsm8k|openmathinstruct2)\n"
-            "  llm-finetune-preloaded User-supplied JSONL (set dataset.dataset_root=<path>)\n"
-            "VLM datasets:\n"
-            "  vlm-energon            Energon multimodal (set dataset.path=<path>)\n"
-            "  vlm-hf                 HF VLM dataset (set dataset.maker_name=<name>)\n"
-            "  vlm-preloaded          User-supplied VLM JSON (set dataset.train_data_path=<path>)"
-        ),
-    )
-    parser.add_argument(
-        "--step_func",
-        type=str,
-        default="gpt_step",
-        choices=sorted(STEP_FUNCTIONS.keys()),
-        help="Step function: gpt_step (text-only), vlm_step (vision-language), llava_step (LLaVA), "
-        "flux_step (FLUX diffusion), wan_step (WAN diffusion, hyperparameters selected by --mode/recipe name)",
-    )
-    parser.add_argument(
-        "--peft_scheme",
-        type=str,
-        default=None,
-        help="PEFT scheme to use: 'lora', 'dora', or None.",
-    )
-    parser.add_argument(
-        "--packed_sequence",
-        action="store_true",
-        default=False,
-        help="Enable packed sequence training (default: False)",
-    )
-    parser.add_argument(
-        "--seq_length",
-        type=int,
-        default=None,
-        help="Sequence length for training",
-    )
-    parser.add_argument(
-        "--hf_path",
-        type=str,
-        default=None,
-        help="HuggingFace model ID or local path to model directory. "
-        "Use a local path for more stable multinode training.",
-    )
-    args, cli_overrides = parser.parse_known_args()
-    return args, cli_overrides
-
-
-def load_recipe(
-    recipe_name: str,
-    peft_scheme: str | None,
-    packed_sequence: bool = False,
-    seq_length: int | None = None,
-    hf_path: str | None = None,
-) -> ConfigContainer:
-    """
-    Load recipe by name from megatron.bridge.recipes.
-
-    Args:
-        recipe_name: Full recipe function name (e.g., 'llama32_1b_pretrain_config')
-        peft_scheme: PEFT scheme to use ('lora', 'dora', or None)
-        packed_sequence: Enable packed sequence training (default: False)
-        seq_length: Sequence length for training (optional)
-        hf_path: HuggingFace model ID or local path to model directory (optional)
-
-    Returns:
-        ConfigContainer from calling the recipe
-
-    Raises:
-        AttributeError: If recipe not found
-    """
-    if not hasattr(recipes, recipe_name):
-        raise AttributeError(
-            f"Recipe '{recipe_name}' not found in megatron.bridge.recipes.\n"
-            f"Make sure the recipe name is correct and the recipe is exported in its family __init__.py.\n"
-            f"Example recipe names: llama32_1b_pretrain_config, gemma3_1b_pretrain_config, qwen3_8b_pretrain_config"
-        )
-
-    config_builder = getattr(recipes, recipe_name)
-
-    # Inspect the recipe's signature to determine which arguments it accepts
-    try:
-        sig = inspect.signature(config_builder)
-        params = sig.parameters
-        has_var_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())
-
-        accepts_peft = "peft" in params or has_var_keyword
-        accepts_packed_sequence = "packed_sequence" in params or has_var_keyword
-        accepts_seq_length = "seq_length" in params or has_var_keyword
-        accepts_hf_path = "hf_path" in params or has_var_keyword
-    except (ValueError, TypeError):
-        # If signature inspection fails, fallback conservatively
-        accepts_peft = True  # peft is widely supported, try passing it
-        accepts_packed_sequence = False  # new parameter, don't pass if unsure
-        accepts_seq_length = False  # new parameter, don't pass if unsure
-        accepts_hf_path = False  # model-specific, don't pass if unsure
-
-    # Build kwargs dynamically based on what the recipe accepts
-    kwargs = {}
-    if accepts_peft:
-        kwargs["peft"] = peft_scheme
-    if accepts_packed_sequence and packed_sequence:
-        kwargs["packed_sequence"] = packed_sequence
-    if accepts_seq_length and seq_length is not None:
-        kwargs["seq_length"] = seq_length
-    if accepts_hf_path and hf_path is not None:
-        kwargs["hf_path"] = hf_path
-
-    try:
-        return config_builder(**kwargs)
-    except TypeError:
-        # Fallback if the kwargs are not accepted despite signature inspection
-        return config_builder()
-
-
-def load_forward_step(step_type: str, mode: str | None = None) -> Callable:
-    """Load forward_step function based on the requested step type."""
-    step_key = step_type.lower()
-    if step_key not in STEP_FUNCTIONS:
-        raise ValueError(ERR_UNKNOWN_STEP.format(step_type=step_type, choices=", ".join(STEP_FUNCTIONS)))
-    step = STEP_FUNCTIONS[step_key]
-    if inspect.isclass(step):
-        if "mode" in inspect.signature(step.__init__).parameters:
-            return step(mode=mode)
-        return step()
-    return step
-
-
-def infer_train_mode(recipe_name: str) -> str:
-    """Infer training mode from the recipe name (fallback when --dataset is not passed)."""
-    lowered = recipe_name.lower()
-    has_pretrain = "pretrain" in lowered
-    has_finetune = "finetune" in lowered or "sft" in lowered or "peft" in lowered
-    if has_pretrain ^ has_finetune:
-        return "pretrain" if has_pretrain else "finetune"
-    raise ValueError(ERR_INFER_MODE_FAILED)
-
-
-def main() -> None:
-    """Run GPT training (pretrain or finetune)."""
-    args, cli_overrides = parse_args()
-
-    config: ConfigContainer = load_recipe(
-        args.recipe,
-        args.peft_scheme,
-        args.packed_sequence,
-        args.seq_length,
-        args.hf_path,
-    )
-
-    if args.dataset is not None:
-        mode = infer_mode_from_dataset(args.dataset)
-        config = apply_dataset_override(
-            config,
-            dataset_type=args.dataset,
-            packed_sequence=args.packed_sequence,
-            seq_length=args.seq_length,
-            cli_overrides=cli_overrides,
-        )
-    else:
-        mode = infer_train_mode(args.recipe)
-
-    config = process_config_with_overrides(
-        config,
-        cli_overrides=cli_overrides or None,
-    )
-
-    # Ensure dataset.seq_length and model.seq_length stay in sync after CLI overrides
-    if (
-        hasattr(config, "model")
-        and config.model is not None
-        and hasattr(config, "dataset")
-        and config.dataset is not None
-    ):
-        if hasattr(config.dataset, "seq_length") and config.model.seq_length != config.dataset.seq_length:
-            config.model.seq_length = config.dataset.seq_length
-
-    forward_step = load_forward_step(args.step_func, mode=mode)
-    train_func = TRAIN_FUNCTIONS[mode]
-    train_func(config=config, forward_step_func=forward_step)
-
-
-if __name__ == "__main__":
-    main()
-
-```
-</file_contents>
diff --git a/skills/nemotron-customize/context/mbridge-sft.txt b/skills/nemotron-customize/context/mbridge-sft.txt
deleted file mode 100644
index e443ab48f..000000000
--- a/skills/nemotron-customize/context/mbridge-sft.txt
+++ /dev/null
@@ -1,265 +0,0 @@
-# Megatron-Bridge SFT / PEFT Context
-
-Use this pack when generating stage code for either:
-
-- `sft/megatron_bridge` — full or LoRA SFT on packed Parquet
-- `peft/megatron_bridge` — LoRA-only adapter training that produces a
-  separate `checkpoint_lora` artifact
-
-Both steps share the runner, recipe, and config schema. They differ in (a)
-which artifact they advertise in `step.toml` (`checkpoint_megatron` vs
-`checkpoint_lora`) and (b) how the PEFT block is wired.
-
-## When NOT to use this pack
-
-If the base model is **not Nemotron** and is not in the Megatron-Bridge
-native recipe set (e.g. nemotronh / Nano3 / Super3, llama, qwen, mixtral,
-deepseek, kimi, gpt_oss — see the recipe modules under
-`megatron.bridge.recipes.*`), use AutoModel SFT/PEFT instead. AutoModel is
-the path for HF-format models without an MB recipe. See
-[automodel-pretrain.txt](automodel-pretrain.txt) for the same rule on the
-pretrain side.
-
-## In-repo references (read these first)
-
-- Step manifests:
-  - `src/nemotron/steps/sft/megatron_bridge/step.toml`
-  - `src/nemotron/steps/peft/megatron_bridge/step.toml`
-- Step entry points:
-  - `src/nemotron/steps/sft/megatron_bridge/step.py`
-  - `src/nemotron/steps/peft/megatron_bridge/step.py`
-- Shared runner:    `src/nemotron/steps/_runners/megatron_bridge.py`
-- Default configs:
-  - `src/nemotron/steps/sft/megatron_bridge/config/{default,tiny}.yaml`
-  - `src/nemotron/steps/peft/megatron_bridge/config/{default,tiny}.yaml`
-- Live reference recipe: `src/nemotron/recipes/nano3/stage1_sft/`
-- Companion data prep:   `src/nemotron/steps/prep/sft_packing/` (consume `packed_parquet`)
-
-## Shared runner (the non-obvious part)
-
-`run_megatron_bridge(default_recipe, default_config, entry=finetune, ...)`
-in `_runners/megatron_bridge.py` drives both steps. The flow:
-
-1. Load YAML + apply Hydra-style CLI overrides.
-2. Build the recipe `ConfigContainer` from the `recipe:` block (target +
-   kwargs come from YAML, default falls back to `default_recipe`).
-3. **Dynamically discover override sections** from the ConfigContainer's
-   dataclass fields — new MB sections pick up automatically without
-   hardcoding (`train`, `model`, `checkpoint`, `dataset`, `scheduler`,
-   `logger`, `peft`, ...).
-4. Optionally swap the model provider for an HF-loaded one via `AutoBridge`
-   (when `hf_model_path:` is set).
-5. Optionally apply PEFT via `default_peft_config(peft_scheme=...)` from the
-   `peft:` block.
-6. Translate `dataset:` into a `FinetuningDatasetConfig` (this is what
-   `dataset_mode="finetune"` does for SFT/PEFT — pretrain uses different modes).
-7. Hand off to `entry=finetune` (SFT/PEFT) or `entry=pretrain` (pretrain).
-
-Generated stage code should be a thin wrapper that calls the runner — no
-recipe-instantiation logic in user code.
-
-## Default recipe
-
-Both steps default to:
-
-```text
-megatron.bridge.recipes.nemotronh.nemotron_3_nano.nemotron_3_nano_finetune_config
-```
-
-Override at the YAML level via `recipe._target_:` to use a different MB
-recipe family (e.g. `megatron.bridge.recipes.llama.*`,
-`megatron.bridge.recipes.qwen.*`). Pass recipe-level kwargs alongside
-`_target_` (e.g. `seq_length`, `packed_sequence`, `peft`).
-
-## Step contracts
-
-### `sft/megatron_bridge`
-
-- Consumes: `packed_parquet` (required, from `prep/sft_packing`) + optional
-  base `checkpoint_megatron`.
-- Produces: `checkpoint_megatron`.
-- Models declared in step.toml `[[models]]`:
-  - `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` (default, `min_gpus=8`) — Nano3 30B-A3B hybrid Mamba-Transformer MoE.
-  - `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` (`min_gpus=32`) — Super3 120B-A12B hybrid Mamba LatentMoE.
-- Key params: `seq_length` (default 4096; choices 2048/4096/8192/16384/32768),
-  `peft` (default `lora`; choices `lora` / `null`).
-
-### `peft/megatron_bridge`
-
-- Consumes: `packed_parquet` (required) + base `checkpoint_megatron`
-  (required — the adapter has to attach to a real base).
-- Produces: `checkpoint_lora` (merge with `convert/merge_lora` for an HF
-  checkpoint, or with `convert/megatron_to_hf` first if the base is Megatron).
-- Key params: `peft.type` (`lora` only), `peft.dim` (LoRA rank; default 32).
-
-The shipped sft/megatron_bridge default config also enables LoRA
-(`peft: lora`, `peft.dim: 16`) because full SFT of the 30B MoE doesn't fit
-on the starter 16-GPU topology. To do **full** SFT, set `recipe.peft: null`
-and remove the top-level `peft:` block; this only fits when the model and
-parallelism plan agree.
-
-## YAML schema (what the runner reads)
-
-```yaml
-# Optional: replace the model provider with HF weights via AutoBridge.
-hf_model_path: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
-trust_remote_code: true
-load_hf_weights: true
-
-# Optional: explicit container mounts (used to pin Megatron-LM and
-# Megatron-Bridge revisions when the base image is stale).
-run:
-  env:
-    mounts:
-      - ${auto_mount:git+https://github.com/NVIDIA/Megatron-LM.git@<sha>,/opt/megatron-lm}
-      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@<branch>,/opt/Megatron-Bridge}
-
-recipe:
-  _target_: megatron.bridge.recipes.nemotronh.nemotron_3_nano.nemotron_3_nano_finetune_config
-  packed_sequence: true
-  peft: lora                # 'lora' or null
-  seq_length: 4096
-  tensor_model_parallel_size: 4
-  pipeline_model_parallel_size: 1
-  # EP / ETP are not recipe-level kwargs — set them on `model:` below.
-
-# LoRA adapter spec (omit or set type=null for full SFT).
-peft:
-  type: lora
-  dim: 16
-  alpha: 32
-  dropout: 0.0
-  target_modules:
-    - linear_qkv
-    - linear_proj
-
-dataset:
-  seq_length: 4096
-  packed_sequence_specs:
-    packed_sequence_size: 4096
-    packed_train_data_path: ${oc.env:SFT_PACKED_DIR}
-    # packed_val_data_path: ...   # uncomment when valid/ shards exist
-    # packed_metadata_path: ...
-
-train:
-  train_iters: 1000
-  global_batch_size: 8
-  micro_batch_size: 1
-
-model:
-  seq_length: 4096
-  tensor_model_parallel_size: 4
-  pipeline_model_parallel_size: 1
-  context_parallel_size: 1
-  expert_model_parallel_size: 8        # MoE expert parallelism
-  expert_tensor_parallel_size: 1
-  sequence_parallel: true
-  calculate_per_token_loss: true       # required when CP > 1 anywhere
-  recompute_granularity: full
-  recompute_method: uniform
-  recompute_num_layers: 1
-
-scheduler:
-  lr_warmup_iters: 2
-
-logger:
-  log_interval: 1
-  wandb_project: ${oc.env:WANDB_PROJECT,null}
-  wandb_entity: ${oc.env:WANDB_ENTITY,null}
-  wandb_exp_name: ${oc.env:WANDB_NAME,sft-megatron-bridge}
-
-checkpoint:
-  save: ${oc.env:SFT_OUTPUT_DIR}/run-name
-  save_interval: 50
-  finetune: true
-  # PEFT only:
-  pretrained_checkpoint: ${oc.env:NEMOTRON_PRETRAINED_CKPT}
-  # PEFT-friendly defaults — adapter-only checkpoints are sparse, the
-  # async/parallel writers are fragile against that.
-  fully_parallel_save: false
-  async_save: false
-  save_optim: false
-  save_rng: false
-```
-
-## Hard rules (from step.toml [[strategies]] + [[errors]])
-
-- **`prep/sft_packing.pack_size` MUST equal `recipe.seq_length`,
-  `dataset.seq_length`, `dataset.packed_sequence_specs.packed_sequence_size`,
-  and `model.seq_length`** — all five. Mismatch is the most common
-  silent-quality-bug. See `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md`.
-- **Tokenizer + chat template** must match what `prep/sft_packing` used
-  (recovery for `tokenizer_mismatch`).
-- **`dataset.packed_sequence_specs.packed_train_data_path`** points at the
-  `splits/train/*.parquet` directory produced by `prep/sft_packing`, not
-  the run/runs/.../shards path.
-- **For PEFT**, `checkpoint.pretrained_checkpoint` must point at a real
-  Megatron checkpoint dir (not the parent run dir). Don't use
-  `load_hf_weights: true` together with `pretrained_checkpoint` — pick one
-  initialization path.
-- **For full SFT**, set `recipe.peft: null` and **remove** the top-level
-  `peft:` block. Leaving the block in trains LoRA regardless of the recipe
-  flag.
-- **MoE + tensor parallelism** requires `model.sequence_parallel: true`.
-- **CP > 1** anywhere requires `model.calculate_per_token_loss: true`.
-
-## Strategy hints (from sft/megatron_bridge/step.toml)
-
-- `dataset < 10K records` → reduce `train.global_batch_size` and increase
-  `train.train_iters` so optimizer steps remain useful.
-- User wants LoRA → set `recipe.peft: lora` and configure the `peft:` block
-  (or use `peft/megatron_bridge` directly if the output should be a
-  separate adapter artifact).
-- User selects Super3 → start from a 32-GPU plan with `tp=8, pp=4, cp=1`
-  and verify cluster topology before scaling.
-- `seq_length > 32K` → enable hybrid context parallelism (see upstream
-  Megatron-Bridge perf-tuning skills).
-- A100 40GB or memory-tight → enable activation checkpointing
-  (`recompute_granularity: full`) and consider CPU offloading.
-- H100 max-throughput → keep packed sequences on and tune sequence-packing
-  / overlap settings before adding parallelism.
-
-The strategies' `skill:` pointers (e.g. `Megatron-Bridge/skills/perf-techniques/...`)
-live in the upstream repo, not this one. If you can't read them, use the
-`then:` text as guidance and surface a `⚠` in the plan.
-
-## Pipeline placement
-
-```
-prep/sft_packing → sft/megatron_bridge → checkpoint_megatron → eval / RL / convert
-prep/sft_packing → peft/megatron_bridge → checkpoint_lora → convert/merge_lora → eval
-```
-
-For sovereign customizations:
-- `sdg/data_designer (default.yaml)` → `prep/sft_packing` → `sft/megatron_bridge`
-- For target-language SFT, see `src/nemotron/steps/patterns/multilingual-tokenizer-check.md`
-  before choosing tokenizer / pack_size.
-
-## Patterns to cite
-
-- `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md` — the four
-  seq_length fields that must match across the pipeline.
-- `src/nemotron/steps/patterns/sft-sequence-packing.md` — when packing helps
-  vs hurts.
-- `src/nemotron/steps/patterns/sft-data-blending.md` — blend ratios and
-  capability balance for sovereign / multilingual / mixed-source SFT.
-- `src/nemotron/steps/patterns/sft-small-dataset-prefer-lora.md` — when to
-  prefer `peft/megatron_bridge` over full SFT.
-- `src/nemotron/steps/patterns/peft-adapter-merge-discipline.md` — keep the
-  adapter separate until merge is validated.
-- `src/nemotron/steps/patterns/eval-before-and-after-training.md` — bookend
-  every SFT / PEFT run.
-
-## Staleness checks
-
-When updating these steps:
-
-- Refresh recipe target if MB renames `nemotron_3_nano_finetune_config`.
-- Refresh defaults from each `step.toml [[models]]`, `[[parameters]]`, and
-  `config/default.yaml` (especially `peft.dim`, `recipe.tensor_model_parallel_size`,
-  the env-var names like `SFT_PACKED_DIR`, `NEMOTRON_PRETRAINED_CKPT`).
-- Re-verify the section list discovered by `_discover_override_sections`
-  against the live `ConfigContainer`. New sections work automatically; only
-  removed/renamed ones need pack updates.
-- Confirm `packed_sequence_specs` field names match the `PackedSequenceSpecs`
-  dataclass in `megatron.bridge.data.datasets.packed_sequence`.
diff --git a/skills/nemotron-customize/evals/evals.json b/skills/nemotron-customize/evals/evals.json
new file mode 100644
index 000000000..a9cc46f08
--- /dev/null
+++ b/skills/nemotron-customize/evals/evals.json
@@ -0,0 +1,114 @@
+[
+  {
+    "id": "nemotron-customize-translate-llm-command",
+    "question": "In this repo, give me the command to translate /data/news/*.jsonl from English to Hindi with the translate/nemo_curator step. Use text_field=text, output_dir=/data/news_hi, backend=llm, server URL https://integrate.api.nvidia.com/v1, model nvidia/llama-3.3-nemotron-super-49b-v1, and API key env NVIDIA_API_KEY. I only need the command, not a plan.",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should use the existing translate/nemo_curator step and return a complete uv run nemotron steps run translate/nemo_curator command with input_path=/data/news/*.jsonl, output_dir=/data/news_hi, source_language=en, target_language=hi, text_field=text, backend=llm, server.url=https://integrate.api.nvidia.com/v1, server.model=nvidia/llama-3.3-nemotron-super-49b-v1, and server.api_key_env=NVIDIA_API_KEY. It should not generate custom Python, route through BYOB, or omit explicit source and target language codes.",
+    "expected_behavior": [
+      "Read skills/nemotron-customize/SKILL.md before answering.",
+      "Use the step catalog or src/nemotron/steps/translate/nemo_curator/step.toml as the source of truth.",
+      "Return a single runnable translate/nemo_curator command because all required inputs were provided.",
+      "Keep source_language and target_language explicit instead of relying on defaults.",
+      "Do not create a custom translation script when the repo already has a step for this workflow."
+    ]
+  },
+  {
+    "id": "nemotron-customize-lepton-profile-blocked",
+    "question": "Submit sft/automodel on Lepton with -c tiny and batch execution. I do not have an env TOML file in this workspace. Give me the remote command.",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should not invent a Lepton batch profile or emit a remote submission command. It should explain that batch execution requires a reviewed env TOML, usually via NEMOTRON_ENV_FILE, and a concrete profile such as lepton_sft_automodel. It may give the env/env_toml generation command or the local non-batch command, but it should clearly mark the remote command as blocked until the environment file/profile exists.",
+    "expected_behavior": [
+      "Read the skill instructions and environment guidance before answering.",
+      "Identify that Lepton batch execution needs a generated or provided environment TOML.",
+      "Do not guess node groups, mounts, resource shapes, or --batch profile names without an env file.",
+      "Provide the next concrete setup step instead of pretending the remote command is ready.",
+      "Keep the response focused on sft/automodel and do not switch to a different training stack."
+    ]
+  },
+  {
+    "id": "nemotron-customize-byob-translation-routing",
+    "question": "I already generated a BYOB benchmark parquet with multiple-choice questions. I need to translate the benchmark from English to Hindi while preserving the MCQ fields. Which customization workflow should I use and what should the command shape look like?",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should route this to the BYOB workflow, not generic translate/nemo_curator, because the input is a BYOB benchmark with MCQ structure. It should describe using nemotron steps run byob/mcq with the translation stage or translate-specific BYOB config, set source and target languages explicitly, and preserve MCQ schema fields. It should not flatten the benchmark into a single text column unless the user explicitly asks for generic corpus translation.",
+    "expected_behavior": [
+      "Distinguish benchmark translation from generic corpus translation.",
+      "Inspect BYOB-facing references or manifests instead of assuming translate/nemo_curator is always correct.",
+      "Explain that MCQ schema preservation is the reason to use BYOB translation.",
+      "Ask for missing benchmark path or config values if needed before giving an exact command.",
+      "Do not suggest a lossy conversion that drops answer choices or labels."
+    ]
+  },
+  {
+    "id": "nemotron-customize-sft-megatron-bridge-pipeline",
+    "question": "I have OpenAI-style chat JSONL and want to fine-tune a Nemotron checkpoint with Megatron-Bridge. Tell me the correct step sequence and artifacts before you make any code changes.",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should propose data_prep/sft_packing followed by sft/megatron_bridge. It should describe the artifact flow from chat JSONL to packed parquet shards to a Megatron checkpoint, call out that sequence length or packing settings must match the training config, and avoid making code changes because the user asked for the sequence first.",
+    "expected_behavior": [
+      "Read the top-level skill and relevant data_prep and sft references.",
+      "Choose Megatron-Bridge because the user explicitly asked for a Nemotron checkpoint with that stack.",
+      "State the artifact handoff between data preparation and training.",
+      "Mention the configuration values that must be aligned before execution.",
+      "Do not edit files or launch training when the user asked for an explanation first."
+    ]
+  },
+  {
+    "id": "nemotron-customize-automodel-lora-choice",
+    "question": "I only have two GPUs and want a quick LoRA run on a Hugging Face model using OpenAI-style chat JSONL. Which Nemotron customization path should I use?",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should prefer the AutoModel PEFT path, such as peft/automodel, over Megatron-Bridge full SFT. It should explain that AutoModel is the better fit for a small GPU count and Hugging Face model workflow, while Megatron-Bridge is better for larger distributed Nemotron-style training. It should identify the expected input data shape and mention any config values needed before a runnable command can be finalized.",
+    "expected_behavior": [
+      "Map the user's resource constraint and LoRA requirement to peft/automodel.",
+      "Do not choose Megatron-Bridge by default for a two-GPU quick LoRA run.",
+      "Explain the reason for the stack choice in practical terms.",
+      "Call out required inputs such as model id, data path, output directory, and environment profile.",
+      "Avoid inventing paths or secret values."
+    ]
+  },
+  {
+    "id": "nemotron-customize-checkpoint-conversion",
+    "question": "My Megatron training job produced /mnt/lustre-shared/output/sft/megatron_bridge/iter_0001000 and I need a deployable Hugging Face checkpoint under /mnt/lustre-shared/output/sft/hf_export. Which step should I run?",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should use convert/megatron_to_hf and build the command around the concrete iteration checkpoint path and requested Hugging Face export directory. It should mention that the conversion needs the correct source checkpoint layout and model/config information. It should not point the command at the parent training run directory if the step expects the iteration checkpoint.",
+    "expected_behavior": [
+      "Use the conversion workflow instead of retraining or evaluation.",
+      "Select convert/megatron_to_hf, not convert/hf_to_megatron.",
+      "Use the specific iter_0001000 checkpoint as the source in the command shape.",
+      "Use the requested hf_export path as the output destination.",
+      "Identify missing model/config metadata rather than fabricating it."
+    ]
+  },
+  {
+    "id": "nemotron-customize-eval-existing-endpoint",
+    "question": "I have an OpenAI-compatible endpoint for a customized model and want to evaluate it on IFEval and GPQA. I do not want to deploy anything new. What Nemotron step should I use?",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should use eval/model_eval against the existing endpoint. It should include the endpoint URL, model name, API key environment variable, and benchmark selection in the command or config overlay. It should not route through training, deployment, or BYOB.",
+    "expected_behavior": [
+      "Choose eval/model_eval because the user asked to evaluate an existing endpoint.",
+      "Preserve the requirement not to deploy a new model.",
+      "Ask for or include endpoint URL, model name, API key env var, and benchmark names.",
+      "Keep IFEval and GPQA as the selected benchmarks.",
+      "Do not suggest unrelated training or data preparation workflows."
+    ]
+  },
+  {
+    "id": "nemotron-customize-curate-before-translation",
+    "question": "Before translating a local JSONL corpus, I want a light Curator smoke test that reads text from the text field and writes cleaned output. I do not want aggressive domain or language filters yet. Which command shape should I use?",
+    "expected_skill": "nemotron-customize",
+    "expected_script": null,
+    "ground_truth": "The answer should use curate/nemo_curator with local input and output paths, text_field=text, and permissive or disabled filters for the first smoke test. It should not add strict language, domain, quality, or dedup filters unless the user asks for them. It should explain that the smoke test validates IO and schema before tightening filters.",
+    "expected_behavior": [
+      "Route corpus cleaning to curate/nemo_curator instead of translation or training.",
+      "Keep the first run permissive because the user requested a smoke test.",
+      "Require concrete input and output paths before giving a fully runnable command.",
+      "Use text_field=text in the command shape.",
+      "Explain that stricter filtering can be added after IO is validated."
+    ]
+  }
+]
diff --git a/skills/nemotron-customize/act/PROJECT.md b/skills/nemotron-customize/references/act/PROJECT.md
similarity index 90%
rename from skills/nemotron-customize/act/PROJECT.md
rename to skills/nemotron-customize/references/act/PROJECT.md
index eb9854fd1..f62e21983 100644
--- a/skills/nemotron-customize/act/PROJECT.md
+++ b/skills/nemotron-customize/references/act/PROJECT.md
@@ -23,7 +23,7 @@ Per-stage implementations are delegated to sub-agents via [STAGE.md](STAGE.md)
 └── .generated/
     ├── pipeline.toml           # canonical stage graph
     ├── SKILL.md                # invocable as /<project-name>
-    └── plugin.json             # .claude-plugin manifest
+    └── plugin.json             # agent plugin manifest
 ```
 
 **Naming:**
@@ -77,7 +77,7 @@ $DATA_ROOT/
 The filesystem **is** the artifact graph. Document the layout in `README.md`.
 
 The reference recipes under
-[src/nemotron/recipes/](../../../src/nemotron/recipes/) use `${art:...}` for
+[src/nemotron/recipes/](../../../../src/nemotron/recipes/) use `${art:...}` for
 W&B-Artifacts lineage — that's a different system. Don't propagate it into
 generated code.
 
@@ -93,13 +93,13 @@ generated code.
 ```toml
 [[stages]]
 id = "01_translate"
-step = "translate/nemo_skills"
+step = "translate/nemo_curator"
 consumes = "filtered_jsonl"
 produces = "translated_jsonl"
 
 [[stages]]
 id = "02_prep"
-step = "prep/sft_packing"
+step = "data_prep/sft_packing"
 consumes = "translated_jsonl"
 produces = "packed_parquet"
 ```
@@ -116,7 +116,7 @@ STAGES = [s["id"] for s in _pipeline["stages"]]
 ### R5. Generated skill + plugin
 
 `.generated/SKILL.md` + `.generated/plugin.json` make the project invocable as
-`/<project-name>` so the user can run, debug, and iterate via Claude Code.
+`/<project-name>` so the user can run, debug, and iterate via an agent client.
 
 Keep it narrow: "what this pipeline does, how to run each stage, README
 layout." **Don't duplicate `nemotron-customize` content.**
@@ -162,7 +162,7 @@ This pipeline follows the eval-bookends pattern (eval before and after training)
 Packing follows pack-variable-length for heterogeneous SFT data.
 ```
 
-Patterns live at [src/nemotron/steps/patterns/](../../../src/nemotron/steps/patterns/).
+Patterns live at [src/nemotron/steps/patterns/](../../../../src/nemotron/steps/patterns/).
 
 ### R10. Deploy targets share `stages/`
 
@@ -195,7 +195,7 @@ After the scaffold is written, spawn one sub-agent per stage. Each sub-agent:
 You are implementing stage <NN>_<name> = <step_id>.
 
 Load:
-  - skills/nemotron-customize/act/STAGE.md     (implementation contract)
+  - skills/nemotron-customize/references/act/STAGE.md     (implementation contract)
   - <context_pack_path>                         (from context/index.toml lookup)
 
 Plan requirements:
@@ -209,7 +209,7 @@ Deliverables (exactly these, all under output path):
   - run.py
   - __init__.py
   - config/default.yaml
-  - config/tiny.yaml
+  - config/tiny.yaml (or the step's checked-in smoke config instead, e.g. config/tiny_chat.yaml for eval/model_eval)
 
 Report back: files written, config knobs exposed, any UPSTREAM notes,
 strategies followed (for the plan's traceability log).
@@ -225,5 +225,5 @@ Stages can be generated in parallel — they're independent directories.
 - [ ] Every `consumes`/`produces` chain is consistent.
 - [ ] `pyproject.toml` covers every import in every stage.
 - [ ] `README.md` mermaid matches actual stages.
-- [ ] `tiny.yaml` exists per stage with reduced iterations.
+- [ ] A smoke config exists per stage with reduced scope, using the step's checked-in naming convention.
 - [ ] No `${art:...}` references leaked into generated stage configs.
diff --git a/skills/nemotron-customize/act/STAGE.md b/skills/nemotron-customize/references/act/STAGE.md
similarity index 87%
rename from skills/nemotron-customize/act/STAGE.md
rename to skills/nemotron-customize/references/act/STAGE.md
index 53371e35b..e85edba7e 100644
--- a/skills/nemotron-customize/act/STAGE.md
+++ b/skills/nemotron-customize/references/act/STAGE.md
@@ -23,7 +23,7 @@ config, write the stage files. Thin. Runnable. Agent-legible.
 ├── __init__.py             # re-export only: `from .run import run_<stage_name>`
 └── config/
     ├── default.yaml        # production config
-    └── tiny.yaml           # smoke test (10 iters, small data, tiny seqlen)
+    └── tiny.yaml           # smoke-test config (or the step's checked-in smoke config, e.g. tiny_chat.yaml)
 ```
 
 Don't create shared project files — the main agent owns those (see
@@ -69,7 +69,7 @@ If a library lacks a clean public API, write the minimal shim and add a
 reimplementation.
 
 The reference implementation for SFT data prep lives in
-[src/nemotron/recipes/nano3/stage1_sft/data_prep.py](../../../src/nemotron/recipes/nano3/stage1_sft/data_prep.py).
+[src/nemotron/recipes/nano3/stage1_sft/data_prep.py](../../../../src/nemotron/recipes/nano3/stage1_sft/data_prep.py).
 Use it as your shape model — same `# /// script [tool.runspec] ///` header
 pattern, same thin-wrapper-around-library-API approach.
 
@@ -144,7 +144,8 @@ sequence_parallel: true
 
 - Directories: lowercase + underscores (`stages/sft/`, not `stages/SFT/`).
 - Public entry: `run_<stage_name>()`.
-- Configs: `default.yaml` and `tiny.yaml`, always.
+- Configs: `default.yaml` and the step's checked-in smoke config name. Most
+  stages use `tiny.yaml`; eval/model_eval uses `tiny_chat.yaml`.
 
 ### Style
 
@@ -172,7 +173,7 @@ sequence_parallel: true
    library's API — read it, adapt, don't copy verbatim.
 2. **Valid imports only.** Every import must reference a real module from the
    step's reference code (`steps/<cat>/<step>/step.py` or one of
-   [steps/_runners/](../../../src/nemotron/steps/_runners/)).
+   [steps/_runners/](../../../../src/nemotron/steps/_runners/)).
 3. **No placeholders, hardcoded paths, or tmpdir.** Every path is a CLI arg
    or DATA_ROOT-relative. Runtime-generated orchestrator configs (e.g. nemo-run
    launch files) go to `$DATA_ROOT/<stage>/configs/`. Don't confuse those with
@@ -232,7 +233,7 @@ def run_prep(
 ```
 
 Keep `tokenizer` and `pack_size` aligned with the downstream training stage —
-see [patterns/prep-data-is-tokenizer-locked.md](../../../src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md) and [patterns/sft-sequence-packing.md](../../../src/nemotron/steps/patterns/sft-sequence-packing.md).
+see [patterns/prep-data-is-tokenizer-locked.md](../../../../src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md) and [patterns/sft-sequence-packing.md](../../../../src/nemotron/steps/patterns/sft-sequence-packing.md).
 
 ---
 
@@ -242,13 +243,13 @@ Multi-GPU training needs a process launcher (torchrun) and lives behind
 nemo-run's `Experiment` + `Script` abstraction. **Don't invent the nemo-run
 API from memory.** The authoritative reference is the in-repo runner:
 
-- [src/nemotron/steps/_runners/megatron_bridge.py](../../../src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps.
-- [src/nemotron/steps/_runners/automodel.py](../../../src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps.
-- [src/nemotron/steps/_runners/nemo_rl.py](../../../src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps.
+- [src/nemotron/steps/_runners/megatron_bridge.py](../../../../src/nemotron/steps/_runners/megatron_bridge.py) — used by sft/peft/pretrain Megatron-Bridge steps.
+- [src/nemotron/steps/_runners/automodel.py](../../../../src/nemotron/steps/_runners/automodel.py) — used by AutoModel steps.
+- [src/nemotron/steps/_runners/nemo_rl.py](../../../../src/nemotron/steps/_runners/nemo_rl.py) — used by all NeMo-RL alignment steps.
 
 Mirror the runner's call shape; don't import recipe modules directly. Use
 `nemotron.kit.recipe_loader.import_recipe_function` with a string target —
-the live [src/nemotron/steps/sft/megatron_bridge/step.py](../../../src/nemotron/steps/sft/megatron_bridge/step.py)
+the live [src/nemotron/steps/sft/megatron_bridge/step.py](../../../../src/nemotron/steps/sft/megatron_bridge/step.py)
 shows the exact pattern.
 
 W&B for training is **not** configured through a nemo-run tracker. It's driven
diff --git a/skills/nemotron-customize/context/README.md b/skills/nemotron-customize/references/context/README.md
similarity index 60%
rename from skills/nemotron-customize/context/README.md
rename to skills/nemotron-customize/references/context/README.md
index c5f3a48f5..cd02b4fe6 100644
--- a/skills/nemotron-customize/context/README.md
+++ b/skills/nemotron-customize/references/context/README.md
@@ -20,22 +20,13 @@ one of:
 | `automodel-*.txt` | NVIDIA-NeMo/Automodel | `$AUTOMODEL_ROOT` |
 | `curator-*.txt` | NVIDIA-NeMo/Curator | `$CURATOR_ROOT` |
 | `eval-*.txt` | NVIDIA-NeMo/Evaluator | `$EVALUATOR_ROOT` |
+| `checkpoint-conversion.txt` | NVIDIA-NeMo/Megatron-Bridge / HF PEFT | `$MBRIDGE_ROOT`, `$HF_HOME` |
 | `nemo-rl-alignment.txt` | NVIDIA-NeMo/RL | (linked via URL) |
-| `speaker-translation-faith.txt` | NVIDIA Speaker | `$SPEAKER_ROOT` |
+| `curator-translation-faith.txt` | NVIDIA-NeMo/Curator | `$CURATOR_ROOT` |
 | `modelopt-optimization.txt` | NVIDIA Model Optimizer | (linked via URL) |
 | `data-designer-sdg.txt` | NVIDIA Data Designer | (linked via URL) |
 | `nemotron-data-prep.txt` | NVIDIA-NeMo/Nemotron (this repo) | `$NEMOTRON_ROOT` |
 
-Original contributor-host absolute paths have been replaced with the env vars
-listed above.
-
-## nv-base findings
-
-Several packs trigger nv-base SECURITY and PII rules because they contain
-upstream library code samples (env-var-based credential setup, install steps
-that mention root, optimizer hyperparameter literals, etc.). These are
-**documentation excerpts**, not executable paths in this repo. The packs are
-read-only docs consumed by the Act phase — no code path here invokes them.
-
-When triaging an nv-base run, file findings under `context/*.txt` separately
-from findings under any `SKILL.md` / `act/*.md`.
+These packs are curated summaries for agent grounding. They are intentionally
+short and should point agents back to the repo step manifest, config, runner,
+and active env TOML instead of duplicating upstream documentation.
diff --git a/skills/nemotron-customize/references/context/automodel-launcher-executor-modes.txt b/skills/nemotron-customize/references/context/automodel-launcher-executor-modes.txt
new file mode 100644
index 000000000..610066e47
--- /dev/null
+++ b/skills/nemotron-customize/references/context/automodel-launcher-executor-modes.txt
@@ -0,0 +1,52 @@
+# AutoModel Launcher And Executor Context
+
+Use this pack only when a user asks how to run an AutoModel SFT/PEFT step on a
+specific execution backend. It is not the source of the training schema; read
+the step config and `src/nemotron/steps/_runners/automodel.py` first.
+
+## Contract
+
+- Prefer the repo-native command:
+  `uv run nemotron steps run sft/automodel -c <config>`.
+- For remote execution, use the active env TOML and choose a real profile. Do
+  not infer the `--batch` profile from examples or naming conventions.
+- Do not generate custom launcher Python when a step config plus env profile can
+  express the run.
+- Keep secrets in environment variables referenced by env TOML or the runtime
+  environment, not in generated YAML.
+
+## Backend Selection
+
+| Situation | Use |
+|---|---|
+| Local wiring smoke test | `-c tiny --dry-run` first, then local run only if hardware is available |
+| Lepton or DGX Cloud submission | `--batch <profile>` from `NEMOTRON_ENV_FILE` or repo-root `env*.toml` |
+| Slurm submission | Slurm env TOML profile with the container, mounts, and env vars already defined |
+| Missing env file | Stop and ask the user for an env TOML or help generate one; do not invent a batch profile |
+
+## What To Read
+
+1. `src/nemotron/steps/sft/automodel/step.toml` or
+   `src/nemotron/steps/peft/automodel/step.toml`.
+2. The selected `config/tiny.yaml` or `config/default.yaml`.
+3. `src/nemotron/steps/_runners/automodel.py` for the exact command shape.
+4. Active env TOML sections when remote execution is requested.
+
+## Config Rules
+
+- AutoModel consumes chat-format JSONL, not packed Parquet.
+- Keep `model.pretrained_model_name_or_path`, dataset path, tokenizer/chat
+  template assumptions, and output directory explicit.
+- Use `peft=lora` or a LoRA block for adapter tuning; use full SFT only when the
+  user has enough GPU memory and wants a full checkpoint.
+- For adapter output, plan `convert/merge_lora` if the final artifact must be a
+  standalone HF checkpoint.
+
+## Failure Modes
+
+- If `uv run nemotron steps run ... --dry-run` cannot locate the config, use the
+  full config path instead of an alias.
+- If a remote submission lacks mounts for data/checkpoint paths, fix the env
+  profile before running the job.
+- If W&B is enabled in the training config or env, require `WANDB_API_KEY` in
+  the environment.
diff --git a/skills/nemotron-customize/context/automodel-pretrain.txt b/skills/nemotron-customize/references/context/automodel-pretrain.txt
similarity index 96%
rename from skills/nemotron-customize/context/automodel-pretrain.txt
rename to skills/nemotron-customize/references/context/automodel-pretrain.txt
index 408c39c6f..7df92bcb1 100644
--- a/skills/nemotron-customize/context/automodel-pretrain.txt
+++ b/skills/nemotron-customize/references/context/automodel-pretrain.txt
@@ -68,14 +68,14 @@ Don't put model-family-specific logic in the wrapper.
 
 ## Data: bin/idx pretraining shards
 
-The step consumes `binidx` produced by `prep/pretrain_prep` (Megatron-format
+The step consumes `binidx` produced by `data_prep/pretrain_prep` (Megatron-format
 shards plus `blend.json`). The default config wires it through the
 `MegatronPretraining` dataset:
 
 ```yaml
 dataset:
   _target_: nemo_automodel.components.datasets.llm.megatron_dataset.MegatronPretraining
-  paths: ${oc.env:PRETRAIN_BLEND_PATH}    # blend.json path from prep/pretrain_prep
+  paths: ${oc.env:PRETRAIN_BLEND_PATH}    # blend.json path from data_prep/pretrain_prep
   index_mapping_dir: ./index_mapping/train
   tokenizer:
     _target_: nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained
@@ -84,7 +84,7 @@ dataset:
 
 Validation uses a separate `validation_dataset:` block of the same shape.
 
-The tokenizer must match what `prep/pretrain_prep` used — see
+The tokenizer must match what `data_prep/pretrain_prep` used — see
 `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md`.
 
 ## CPT vs from scratch
@@ -100,7 +100,7 @@ Default `model.pretrained_model_name_or_path` in this repo is
 `[[models]]`). Override at CLI:
 
 ```bash
-nemotron step run pretrain/automodel -c default \
+nemotron steps run pretrain/automodel -c default \
   model.pretrained_model_name_or_path=<your-hf-id>
 ```
 
diff --git a/skills/nemotron-customize/references/context/automodel-sft-peft-core.txt b/skills/nemotron-customize/references/context/automodel-sft-peft-core.txt
new file mode 100644
index 000000000..ea6f44eb7
--- /dev/null
+++ b/skills/nemotron-customize/references/context/automodel-sft-peft-core.txt
@@ -0,0 +1,55 @@
+# AutoModel SFT And PEFT Context
+
+Use this pack when configuring `sft/automodel` or `peft/automodel`.
+
+## Product Contract
+
+- AutoModel is the HF-native path. It consumes chat-format `training_jsonl` and
+  produces an HF checkpoint for full SFT or a LoRA adapter for PEFT.
+- Do not feed packed Parquet to AutoModel. Packed Parquet is for
+  Megatron-Bridge SFT/PEFT.
+- Prefer YAML overrides against the existing step configs. Do not write a new
+  training script unless the repo runner cannot express the request.
+
+## When To Pick AutoModel
+
+| User constraint | Decision |
+|---|---|
+| HF checkpoint output is required | Prefer AutoModel |
+| 1-4 GPU iteration or smaller model | Prefer AutoModel |
+| Non-Nemotron or custom HF model | Prefer AutoModel unless a Megatron-Bridge recipe exists |
+| Large distributed Megatron checkpoint output | Prefer Megatron-Bridge |
+| Adapter-only tuning on HF data | `peft/automodel` |
+
+## Required Inputs
+
+- `model.pretrained_model_name_or_path`: HF id or local HF checkpoint path.
+- `dataset.path_or_dataset_id`: chat-format JSONL or dataset id.
+- Output directory for checkpoints/adapters.
+- Tokenizer/chat-template expectations if they differ from the model defaults.
+
+## SFT Rules
+
+- Use full SFT only when memory is sufficient and a full HF checkpoint is the
+  desired artifact.
+- Keep batch size, max sequence length, gradient accumulation, and precision
+  explicit in the config for reproducibility.
+- If the dataset does not already have OpenAI-style `messages`, add a data-prep
+  step before AutoModel rather than changing the trainer.
+
+## PEFT Rules
+
+- Record the exact base model with the adapter; `convert/merge_lora` needs the
+  same base checkpoint and tokenizer.
+- Start with modest LoRA rank and alpha for smoke runs. Raise rank only when
+  the task needs more capacity.
+- Treat adapter eval and merged-checkpoint eval as separate validation points.
+
+## Failure Modes
+
+- `packed_parquet_used_with_automodel`: use source JSONL or switch to
+  `sft/megatron_bridge`.
+- `chat_template_missing`: use a tokenizer with chat-template support or
+  normalize the dataset.
+- `oom`: reduce sequence length/batch size, switch to LoRA, or choose a smaller
+  model.
diff --git a/skills/nemotron-customize/context/byob-benchmark-curator-translation.txt b/skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt
similarity index 92%
rename from skills/nemotron-customize/context/byob-benchmark-curator-translation.txt
rename to skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt
index 74d96fade..e2b910a14 100644
--- a/skills/nemotron-customize/context/byob-benchmark-curator-translation.txt
+++ b/skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt
@@ -11,15 +11,16 @@ families such as GSM8K.
 
 ## Step Contract
 
-- Step id: `byob`
-- CLI: `nemotron byob`
+- Step id: `byob/mcq`
+- CLI: `nemotron steps run byob/mcq`
 - Source package: `src/nemotron/steps/byob/`
+- Step manifest: `src/nemotron/steps/byob/mcq/step.toml`
 - Generic dispatcher: `src/nemotron/steps/byob/scripts/runtime.py`
 - MCQ orchestration: `src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py`
 - Optional dependency extra: `byob` (`uv sync --extra byob` or `pip install ".[byob]"`)
-- Generation config: `src/nemotron/steps/byob/config/default.yaml`
-- Tiny smoke config: `src/nemotron/steps/byob/config/tiny.yaml`
-- Translation config: `src/nemotron/steps/byob/config/translate.yaml`
+- Generation config: `src/nemotron/steps/byob/mcq/config/default.yaml`
+- Tiny smoke config: `src/nemotron/steps/byob/mcq/config/tiny.yaml`
+- Translation config: `src/nemotron/steps/byob/mcq/config/translate.yaml`
 - Produces: `mcq_benchmark_parquet`
 - Optional translation produces: `translated_mcq_benchmark_parquet`
 
@@ -60,8 +61,7 @@ Final MCQ parquet columns must remain:
 ## Translation Flow
 
 BYOB translation uses Curator experimental translation as the only text translation engine. Import
-translation stages from `nemo_curator.stages.text.experimental.translation`; do not use the old
-`nemo_curator.stages.text.translation` namespace.
+translation stages from `nemo_curator.stages.text.experimental.translation`.
 
 Preserve this division of responsibility:
 
diff --git a/skills/nemotron-customize/references/context/checkpoint-conversion.txt b/skills/nemotron-customize/references/context/checkpoint-conversion.txt
new file mode 100644
index 000000000..bf93adb13
--- /dev/null
+++ b/skills/nemotron-customize/references/context/checkpoint-conversion.txt
@@ -0,0 +1,44 @@
+# Checkpoint Conversion Context
+
+Use this pack for the `convert/*` steps.
+
+## Product Contract
+
+- Conversion is an explicit pipeline stage. Do not silently change downstream
+  steps to consume a different checkpoint layout.
+- Keep source and destination paths separate so a failed conversion cannot
+  corrupt the input checkpoint.
+- Verify tokenizer/config files travel with HF outputs.
+
+## Step Map
+
+| Step | Input | Output | Use when |
+|---|---|---|---|
+| `convert/hf_to_megatron` | `checkpoint_hf` | `checkpoint_megatron` | A Megatron-Bridge consumer needs distributed checkpoint layout |
+| `convert/megatron_to_hf` | `checkpoint_megatron` | `checkpoint_hf` | HF-native eval, deployment, merge, or optimization needs safetensors layout |
+| `convert/merge_lora` | `checkpoint_lora` + `checkpoint_hf` | `checkpoint_hf` | Adapter must become a standalone HF checkpoint |
+
+## Rules
+
+- For Megatron export, point at the concrete `iter_*` checkpoint directory, not
+  just the parent run directory.
+- For HF import, point at a clean HF model directory with config, tokenizer, and
+  weights.
+- For LoRA merge, use the exact base model used during adapter training.
+- Keep `trust_remote_code=true` only when the HF architecture requires it and
+  the source is trusted.
+
+## Pipeline Patterns
+
+- `peft/automodel` -> `convert/merge_lora` for standalone HF output.
+- `sft/megatron_bridge` -> `convert/megatron_to_hf` for HF-native eval or
+  deployment.
+- `sft/automodel` -> `convert/hf_to_megatron` only when a Megatron-only
+  downstream step requires it.
+
+## Failure Modes
+
+- `source_not_clean_hf_checkpoint`: use a real HF model directory, not trainer
+  logs or adapter-only output.
+- `bad_megatron_checkpoint_path`: use the fully written `iter_*` directory.
+- `base_model_mismatch`: merge adapters only with their original base.
diff --git a/skills/nemotron-customize/references/context/curator-data-acquisition.txt b/skills/nemotron-customize/references/context/curator-data-acquisition.txt
new file mode 100644
index 000000000..93346cc83
--- /dev/null
+++ b/skills/nemotron-customize/references/context/curator-data-acquisition.txt
@@ -0,0 +1,49 @@
+# Curator Data Acquisition Context
+
+Use this pack for `curate/nemo_curator` when the user needs to materialize raw
+text before downstream curation, translation, pretraining prep, or SFT prep.
+
+## Product Contract
+
+- The current step is a lightweight text curation wrapper. It reads local JSONL
+  or an optional Hugging Face snapshot, applies configured filters, and writes
+  JSONL.
+- Do not implement a full Common Crawl downloader unless the repo step cannot
+  satisfy the user request and the user approves Explorer-mode code.
+- Keep Curator reader/writer stages as the default I/O path.
+
+## Local JSONL Path
+
+Use this when the user already has files:
+
+- Set `dataset=null`.
+- Set `input_glob` to the JSONL file or shard glob visible inside the runtime.
+- Set `output_dir` to a new directory.
+- Start permissive: `language_codes=[]`, `domains=[]`, `quality_filters={}`.
+- Add filters only after reader/writer output is verified.
+
+## Hugging Face Snapshot Path
+
+Use this when the user names a dataset:
+
+- Set `dataset.repo_id`, `dataset.repo_type`, `dataset.local_dir`, and
+  `allow_patterns` as needed.
+- Point `input_glob` inside `dataset.local_dir`.
+- Ensure `HF_TOKEN` and `HF_HOME` are available in the runtime when needed.
+
+## Operational Rules
+
+- Split one huge JSONL into shards before Curator reads it if memory pressure is
+  expected.
+- For Lepton or other remote runs, make sure input/output paths live on a
+  mounted shared filesystem.
+- Set `ray.num_cpus` in YAML or via env profile when the default CPU count is
+  not enough.
+
+## Failure Modes
+
+- `input_glob_no_matches`: verify the path inside the container, not only on
+  the submit host.
+- `large_file_oom`: shard input before retrying.
+- `empty_or_tiny_output`: disable filters first, then re-enable one gate at a
+  time.
diff --git a/skills/nemotron-customize/references/context/curator-processing-language-quality.txt b/skills/nemotron-customize/references/context/curator-processing-language-quality.txt
new file mode 100644
index 000000000..3301db77c
--- /dev/null
+++ b/skills/nemotron-customize/references/context/curator-processing-language-quality.txt
@@ -0,0 +1,45 @@
+# Curator Processing, Language, And Quality Context
+
+Use this pack for `curate/nemo_curator` when configuring filtering after input
+loading has been verified.
+
+## Product Contract
+
+- Keep this step simple: read JSONL, optionally apply language/domain/word-count
+  gates, write filtered JSONL.
+- Do not add dedup, custom classifiers, or heavy processing unless the current
+  step exposes it or the user approves a new catalog step.
+
+## Filter Controls
+
+| Need | Config |
+|---|---|
+| Preserve all records for smoke test | `language_codes=[]`, `domains=[]`, `quality_filters={}` |
+| Language gating | `language_codes=[...]`, `models.fasttext_langid`, optional `quality_filters.min_langid_score` |
+| Word-count gate | set both `quality_filters.min_words` and `quality_filters.max_words` |
+| Domain gate | set `domains=[...]` and optional `models.hf_cache_dir` |
+
+## Practical Defaults
+
+- Start with a tiny sample and permissive filters.
+- Add one filter family at a time so failures are attributable.
+- Keep `text_field` aligned with the input schema.
+- Record filter thresholds in the generated project config; they materially
+  affect downstream data quality.
+
+## Remote Runtime Notes
+
+- Language and domain models may need cache directories available on the remote
+  filesystem.
+- For CPU-only curation profiles, constrain Ray CPU count instead of relying on
+  all machine CPUs.
+- If output is unexpectedly empty, inspect the intermediate record counts before
+  changing downstream training configs.
+
+## Failure Modes
+
+- `missing_language_model`: disable language filtering or provide the FastText
+  model path.
+- `incomplete_word_filter`: provide both min and max word thresholds or remove
+  both.
+- `empty_or_tiny_output`: relax filters and inspect a few rejected examples.
diff --git a/skills/nemotron-customize/context/curator-translation-faith.txt b/skills/nemotron-customize/references/context/curator-translation-faith.txt
similarity index 89%
rename from skills/nemotron-customize/context/curator-translation-faith.txt
rename to skills/nemotron-customize/references/context/curator-translation-faith.txt
index 64581485f..605c919eb 100644
--- a/skills/nemotron-customize/context/curator-translation-faith.txt
+++ b/skills/nemotron-customize/references/context/curator-translation-faith.txt
@@ -1,6 +1,6 @@
 # NeMo Curator Translation + FAITH Context
 
-Use this context when generating a `translate/translation` stage.
+Use this context when generating a `translate/nemo_curator` stage.
 
 ## Product Contract
 
@@ -13,10 +13,10 @@ Use this context when generating a `translate/translation` stage.
 
 ## Reference Implementation
 
-- Step wrapper: `src/nemotron/steps/translate/translation/step.py`
-- Step config: `src/nemotron/steps/translate/translation/config/default.yaml`
-- CLI command: `nemotron steps translation`
-- Curator stage: `nemo_curator.stages.text.translation.TranslationStage`
+- Step wrapper: `src/nemotron/steps/translate/nemo_curator/step.py`
+- Step config: `src/nemotron/steps/translate/nemo_curator/config/default.yaml`
+- CLI command: `nemotron steps run translate/nemo_curator`
+- Curator stage: `nemo_curator.stages.text.experimental.translation.TranslationStage`
 - Curator I/O: `JsonlReader`, `ParquetReader`, `JsonlWriter`, `ParquetWriter`
 
 ## Configuration Guidance
@@ -28,7 +28,7 @@ Use this context when generating a `translate/translation` stage.
 - `backend=aws` uses Amazon Translate; require AWS credentials in the environment or role and choose `aws.region`.
 - `backend=google` uses Google Cloud Translation; require Google credentials, `google.api_version`, and `google.project_id` for v3.
 - `output_mode=both` is the safest default for generated projects because it keeps translated fields and metadata.
-- `faith_eval.segment_level=true` is preferred for long corpora because scoring follows the same segmentation boundary as translation.
+- FAITH scoring runs on the same translated segments produced by the translation stage, then merges the scores back onto the output records.
 
 ## Questions To Ask Before Generation
 
diff --git a/skills/nemotron-customize/context/data-designer-sdg.txt b/skills/nemotron-customize/references/context/data-designer-sdg.txt
similarity index 94%
rename from skills/nemotron-customize/context/data-designer-sdg.txt
rename to skills/nemotron-customize/references/context/data-designer-sdg.txt
index 2c1ce7b5a..b7fc75445 100644
--- a/skills/nemotron-customize/context/data-designer-sdg.txt
+++ b/skills/nemotron-customize/references/context/data-designer-sdg.txt
@@ -95,7 +95,7 @@ Validation checks for tool-call data (run before SFT):
 Emits `{"prompt", "chosen", "rejected"}`. Flows into:
 
 ```
-sdg/data_designer → prep/rl_prep → rl/nemo_rl/dpo
+sdg/data_designer → data_prep/rl_prep → rl/nemo_rl/dpo
 ```
 
 ## SFT data (`default.yaml`)
@@ -103,11 +103,11 @@ sdg/data_designer → prep/rl_prep → rl/nemo_rl/dpo
 Emits `{"messages": [...]}`. Flows into:
 
 ```
-sdg/data_designer → prep/sft_packing → sft/megatron_bridge   (Megatron-Bridge SFT)
+sdg/data_designer → data_prep/sft_packing → sft/megatron_bridge   (Megatron-Bridge SFT)
 sdg/data_designer →                    sft/automodel          (AutoModel SFT, no packing)
 ```
 
-Use `prep/sft_packing` only for Megatron-Bridge SFT. AutoModel reads JSONL
+Use `data_prep/sft_packing` only for Megatron-Bridge SFT. AutoModel reads JSONL
 directly.
 
 ## Patterns to cite
diff --git a/skills/nemotron-customize/references/context/eval-deploy-formats.txt b/skills/nemotron-customize/references/context/eval-deploy-formats.txt
new file mode 100644
index 000000000..e718f2729
--- /dev/null
+++ b/skills/nemotron-customize/references/context/eval-deploy-formats.txt
@@ -0,0 +1,42 @@
+# Evaluation Deployment Context
+
+Use this pack when `eval/model_eval` needs deployment guidance for hosted
+endpoints or Megatron checkpoints. HF checkpoints require an existing hosted
+endpoint or a conversion path before using the checked-in Megatron deployment
+config.
+
+## Product Contract
+
+- Prefer an existing OpenAI-compatible endpoint when the user provides one.
+- If deployment is part of the eval step, use the checked-in config and env
+  profile. Do not fabricate a deployment service in skill guidance.
+- Keep deployment and evaluation artifacts separate from training outputs.
+
+## Artifact Routing
+
+| Input artifact | Deployment path |
+|---|---|
+| `checkpoint_hf` | Existing hosted endpoint, or convert to a supported deployment format first |
+| `checkpoint_megatron` | `eval/model_eval/config/default.yaml` Megatron deployment path |
+| LoRA adapter | Merge or load adapter with base before evaluation, depending on supported serving path |
+| Existing URL | Skip deployment and configure evaluator against the URL |
+
+## Endpoint Rules
+
+- Chat/instruction benchmarks need a chat-compatible endpoint.
+- Logprob benchmarks need completions/logprobs support and a matching tokenizer.
+- Keep model name, URL, and API-key env var explicit in config or CLI
+  overrides.
+- Do not print resolved secret values.
+
+## Remote Rules
+
+- For Lepton/Slurm/DGX Cloud, pick the deployment/eval profile from env TOML.
+- Verify mounted checkpoint paths exist inside the runtime container.
+- Use dry-run or a limited benchmark to validate launch wiring only, not model quality.
+
+## Failure Modes
+
+- `bad_megatron_checkpoint_path`: point at the concrete `iter_*` checkpoint.
+- `endpoint_not_ready`: health-check the service before starting evaluation.
+- `missing_auth`: set the endpoint API key env var in the runtime.
diff --git a/skills/nemotron-customize/references/context/eval-sovereign-benchmarks.txt b/skills/nemotron-customize/references/context/eval-sovereign-benchmarks.txt
new file mode 100644
index 000000000..7e79a0dc8
--- /dev/null
+++ b/skills/nemotron-customize/references/context/eval-sovereign-benchmarks.txt
@@ -0,0 +1,54 @@
+# Evaluation Context: Container-Backed Benchmarks
+
+Use this pack when configuring `eval/model_eval` for NeMo Evaluator Launcher
+tasks that are owned by an evaluator container, including sovereign,
+multilingual, custom-language, standard English, tool, or agent benchmarks.
+
+## Launcher Contract
+
+Evaluator Launcher task entries can include:
+
+- `name`: exact task id from `nemo-evaluator-launcher ls tasks` or `nemo-evaluator-launcher ls task <task_id>`.
+- `container`: evaluation image that owns the task metadata.
+- `endpoint_type`: `chat` or `completions`; logprob tasks need a completions endpoint that supports logprobs.
+
+The task container is the source of truth for benchmark metadata. Do not
+duplicate every task definition in Nemotron code. Do not construct task names
+as `<harness>.<benchmark>` unless the launcher or task container lists that
+exact dotted id.
+
+## Endpoint Selection
+
+Ask for model id, endpoint URL, API key environment variable name, endpoint
+capability, target language, benchmark container image, and smoke versus full
+run. Use `deployment.type=none` for hosted endpoints.
+
+## Benchmark Selection
+
+Pick tasks by target language and endpoint capability, not by model origin. A
+sovereign or region-specific model can still run standard English benchmarks
+when the user wants English capability measurement.
+
+Standard English smoke task ids:
+
+- `adlr_mmlu` with a completions endpoint.
+- `hellaswag` with a completions endpoint that supports logprobs, plus the evaluated model tokenizer.
+- `mmlu_instruct` with a chat endpoint.
+
+Sovereign/Indic examples:
+
+- `sovereign.gsm8k_indic_hi`
+- `sovereign.mmlu_indic_hi`
+- `sovereign.indicgenbench_flores_in_hi`
+
+Indic language codes include `hi`, `bn`, `gu`, `kn`, `mr`, `ml`, `or`, `pa`,
+`ta`, and `te`. Use `_completions` variants for completions-only endpoints and
+`_logprob` variants only after verifying logprob support.
+
+## Metrics
+
+- GSM8K and chat MCQ tasks: `pass@1.correct`
+- MMLU-style logprob tasks: `acc`
+- ARC/BoolQ logprob tasks: `acc_norm`
+- FLORES translation: `chrf`
+- CrossSum summarization: `rouge_l`
diff --git a/skills/nemotron-customize/references/context/eval-standard-nlu.txt b/skills/nemotron-customize/references/context/eval-standard-nlu.txt
new file mode 100644
index 000000000..1f653919f
--- /dev/null
+++ b/skills/nemotron-customize/references/context/eval-standard-nlu.txt
@@ -0,0 +1,45 @@
+# Standard Evaluation Context
+
+Use this pack when configuring `eval/model_eval` benchmark selection and
+endpoint behavior.
+
+## Product Contract
+
+- Evaluation consumes an already trained/deployed checkpoint or endpoint and
+  produces benchmark results.
+- Do not treat tiny or limited-sample runs as quality evidence. They only prove
+  wiring.
+- Use checked-in step configs and the evaluator runner before inventing a new
+  launcher.
+
+## Benchmark Selection
+
+| Goal | Benchmark shape |
+|---|---|
+| Instruction following | chat endpoint, deterministic generation, tasks like IFEval |
+| Knowledge/reasoning | chat endpoint, larger `max_new_tokens`, model-card decoding defaults |
+| Multiple-choice logprob | completions endpoint with logprobs and tokenizer |
+| Regression smoke | tiny subset or small benchmark list, explicitly marked non-comparable |
+
+## Required Runtime Inputs
+
+- `evaluation.tasks`: concrete NeMo Evaluator Launcher task entries.
+- `target.api_endpoint.url`: OpenAI-compatible endpoint URL when
+  `deployment.type=none`.
+- `target.api_endpoint.type`: `chat` or `completions`.
+- Tokenizer path/model handle for logprob benchmarks.
+- API key environment variable when the endpoint requires auth.
+
+## Rules
+
+- Match endpoint type to benchmark type. Chat tasks should not be forced through
+  logprob completions, and logprob tasks need completions/logprobs support.
+- Keep generation budgets explicit for reasoning tasks.
+- Preserve result directories per run so before/after comparisons do not
+  overwrite each other.
+
+## Failure Modes
+
+- `wrong_endpoint_type`: switch chat/completions to match the task.
+- `missing_tokenizer_for_logprobs`: provide tokenizer path or choose chat tasks.
+- `no_endpoint`: deploy first or provide an existing endpoint URL.
diff --git a/skills/nemotron-customize/context/index.toml b/skills/nemotron-customize/references/context/index.toml
similarity index 81%
rename from skills/nemotron-customize/context/index.toml
rename to skills/nemotron-customize/references/context/index.toml
index 5f1d646b2..7e0dffdf1 100644
--- a/skills/nemotron-customize/context/index.toml
+++ b/skills/nemotron-customize/references/context/index.toml
@@ -11,8 +11,8 @@
 # Don't add new packs unless they survive the "<250 line authored extract" bar.
 
 [[packs]]
-id = "byob-benchmark-generate"
-step = "byob"
+id = "byob-mcq-generate"
+step = "byob/mcq"
 intent = "generate"
 file = "byob-benchmark-curator-translation.txt"
 description = "Generate or translate BYOB MCQ benchmarks — domain corpus inputs, MCQ schema, Curator semantic deduplication, experimental translation, and round-trip metrics."
@@ -45,6 +45,34 @@ intent = "deploy"
 file = "eval-deploy-formats.txt"
 description = "Configure evaluator deployment targets and model formats — NIM, vLLM, Hugging Face, and Megatron-Bridge inputs."
 
+[[packs]]
+id = "eval-model-eval-container"
+step = "eval/model_eval"
+intent = "container"
+file = "eval-sovereign-benchmarks.txt"
+description = "Configure NeMo Evaluator Launcher container-backed tasks — sovereign, multilingual, custom-language, standard English, tool, or agent benchmark containers."
+
+[[packs]]
+id = "convert-hf-to-megatron-generate"
+step = "convert/hf_to_megatron"
+intent = "generate"
+file = "checkpoint-conversion.txt"
+description = "Convert a clean Hugging Face checkpoint to Megatron distributed layout for Megatron-Bridge consumers."
+
+[[packs]]
+id = "convert-megatron-to-hf-generate"
+step = "convert/megatron_to_hf"
+intent = "generate"
+file = "checkpoint-conversion.txt"
+description = "Export a concrete Megatron iter_* checkpoint to Hugging Face safetensors for evaluation, deployment, or merge flows."
+
+[[packs]]
+id = "convert-merge-lora-generate"
+step = "convert/merge_lora"
+intent = "generate"
+file = "checkpoint-conversion.txt"
+description = "Merge a LoRA adapter with its exact HF base model to produce a standalone HF checkpoint."
+
 [[packs]]
 id = "optimize-modelopt-distill-generate"
 step = "optimize/modelopt/distill"
@@ -74,22 +102,22 @@ file = "mbridge-sft.txt"
 description = "Megatron-Bridge PEFT/LoRA — adapter dim/alpha, packed Parquet input, base checkpoint binding, adapter-only checkpoint discipline."
 
 [[packs]]
-id = "prep-generate"
-step = "prep/sft_packing"
+id = "data-prep-generate"
+step = "data_prep/sft_packing"
 intent = "generate"
 file = "nemotron-data-prep.txt"
 description = "SFT packing stage — chat templates, sequence packing, shard sizing, packed Parquet."
 
 [[packs]]
-id = "prep-pretrain-generate"
-step = "prep/pretrain_prep"
+id = "data-prep-pretrain-generate"
+step = "data_prep/pretrain_prep"
 intent = "generate"
 file = "nemotron-data-prep.txt"
 description = "Pretraining data prep — HF/local blends, bin/idx tokenization, split blends, tokenizer lock-in."
 
 [[packs]]
-id = "prep-rl-generate"
-step = "prep/rl_prep"
+id = "data-prep-rl-generate"
+step = "data_prep/rl_prep"
 intent = "generate"
 file = "nemotron-data-prep.txt"
 description = "RL data prep — HF placeholder resolution, sharded JSONL, DPO/RLVR/RLHF schemas."
@@ -165,8 +193,8 @@ file = "mbridge-parallelism-performance.txt"
 description = "Tune Megatron-Bridge parallelism and performance — TP/PP/FSDP, overlap, recomputation, and sequence packing."
 
 [[packs]]
-id = "translate-translation-generate"
-step = "translate/translation"
+id = "translate-curator-generate"
+step = "translate/nemo_curator"
 intent = "generate"
 file = "curator-translation-faith.txt"
 description = "Generate translation stage code with NeMo Curator — JSONL/Parquet I/O, structured fields, and FAITH evaluation."
diff --git a/skills/nemotron-customize/references/context/mbridge-parallelism-performance.txt b/skills/nemotron-customize/references/context/mbridge-parallelism-performance.txt
new file mode 100644
index 000000000..31e72c773
--- /dev/null
+++ b/skills/nemotron-customize/references/context/mbridge-parallelism-performance.txt
@@ -0,0 +1,48 @@
+# Megatron-Bridge Parallelism And Performance Context
+
+Use this pack only after the selected Megatron-Bridge step and config are known.
+It helps tune distributed shape; it does not replace the step config.
+
+## Product Contract
+
+- Start from the checked-in tiny/default config and change only the parallelism
+  knobs required by the user's hardware and model.
+- Keep tokenizer, packed sequence length, model sequence length, and dataset
+  sequence length aligned.
+- Do not tune for throughput before the job launches, loads data, and saves a
+  small checkpoint successfully.
+
+## Core Knobs
+
+| Knob | Use |
+|---|---|
+| tensor parallelism (TP) | shard large matrix ops; TP size must evenly divide the world size |
+| pipeline parallelism (PP) | shard layers across GPUs; useful for very deep models |
+| context parallelism (CP) | long sequence memory relief |
+| expert parallelism (EP) | MoE expert sharding when the recipe supports it |
+| sequence parallelism (SP) | memory reduction commonly paired with TP |
+| activation recomputation | memory relief at compute cost |
+| distributed optimizer/FSDP | optimizer-state and gradient memory relief |
+
+## Tuning Order
+
+1. Validate the artifact path and tiny data first.
+2. Set TP/PP/CP/EP to a legal shape for the model and GPU count.
+3. Keep micro batch size at 1 until memory is proven.
+4. Enable activation recomputation before reducing sequence length.
+5. Increase global batch size only after the data-parallel size is known.
+6. Add communication overlap only after correctness and checkpointing work.
+
+## SFT/PEFT Notes
+
+- Megatron SFT/PEFT consumes packed Parquet from `data_prep/sft_packing`.
+- `seq_length` must match the packing `pack_size`.
+- For adapter jobs, keep base checkpoint path and adapter output path distinct.
+
+## Failure Modes
+
+- `world_size_not_divisible`: adjust nodes, GPUs per node, TP, PP, CP, or EP.
+- `sequence_length_mismatch`: repack data or align model/dataset sequence
+  length.
+- `oom`: lower micro batch, enable recomputation, increase parallelism, or
+  switch to PEFT.
diff --git a/skills/nemotron-customize/references/context/mbridge-pretrain.txt b/skills/nemotron-customize/references/context/mbridge-pretrain.txt
new file mode 100644
index 000000000..b5b654f7a
--- /dev/null
+++ b/skills/nemotron-customize/references/context/mbridge-pretrain.txt
@@ -0,0 +1,47 @@
+# Megatron-Bridge Pretraining Context
+
+Use this pack when configuring `pretrain/megatron_bridge`.
+
+## Product Contract
+
+- This step runs pretraining or continued pretraining (CPT) with
+  Megatron-Bridge and produces a Megatron distributed checkpoint.
+- It consumes `binidx` data plus a `blend.json` from
+  `data_prep/pretrain_prep`. It does not consume SFT packed Parquet.
+- Prefer YAML overrides on the existing step config. Do not write custom
+  training loops.
+
+## Required Inputs
+
+- `dataset.data_paths`: path to the emitted `blend.json`.
+- `seq_length`: aligned with tokenizer/data-prep assumptions.
+- Checkpoint mode:
+  - `load_hf_weights=true` for continued pretraining from an HF base.
+  - `checkpoint.pretrained_checkpoint` or equivalent recipe checkpoint field
+    when resuming from a Megatron checkpoint.
+- Output checkpoint directory distinct from input data and source checkpoints.
+
+## When To Pick This Step
+
+| Requirement | Decision |
+|---|---|
+| Megatron checkpoint output | Use `pretrain/megatron_bridge` |
+| Very large distributed training | Use `pretrain/megatron_bridge` |
+| Small HF-native smoke test or simple CPT run | Consider `pretrain/automodel` |
+| Raw text input | Run `data_prep/pretrain_prep` first |
+
+## Configuration Rules
+
+- Keep global batch size divisible by data-parallel size.
+- Start with micro batch size 1 for new hardware/model shapes.
+- Use lower learning rates and shorter runs for domain CPT on small corpora.
+- Take the train/valid/test split paths from the same data-prep output.
+- Do not mix SFT packed data and pretraining bin/idx data.
+
+## Failure Modes
+
+- `missing_blend_json`: run `data_prep/pretrain_prep`.
+- `sequence_length_mismatch`: align data prep, recipe, and model sequence
+  length.
+- `transformer_engine_userbuffer_failure`: set `UB_SKIPMC=1` in the runtime env
+  when CUDA multicast is unavailable.
diff --git a/skills/nemotron-customize/references/context/mbridge-sft.txt b/skills/nemotron-customize/references/context/mbridge-sft.txt
new file mode 100644
index 000000000..aa7737e7a
--- /dev/null
+++ b/skills/nemotron-customize/references/context/mbridge-sft.txt
@@ -0,0 +1,49 @@
+# Megatron-Bridge SFT And PEFT Context
+
+Use this pack for:
+
+- `sft/megatron_bridge`: full or LoRA SFT on packed Parquet.
+- `peft/megatron_bridge`: LoRA adapter training on packed Parquet.
+
+## Product Contract
+
+- These steps consume `packed_parquet` from `data_prep/sft_packing`.
+- `sft/megatron_bridge` produces a Megatron checkpoint.
+- `peft/megatron_bridge` produces a LoRA adapter. Plan conversion/merge when an
+  HF deployment artifact is required.
+- Prefer YAML overrides on existing configs. Do not fork Megatron-Bridge scripts
+  for normal SFT/PEFT.
+
+## Required Wiring
+
+- `dataset.packed_sequence_specs.packed_train_data_path`: training Parquet glob.
+- Validation/test packed paths when the config schedules validation.
+- `seq_length` equal to the data-prep `pack_size`.
+- `checkpoint.pretrained_checkpoint` when adapting a Megatron base.
+- Distinct output directories for base checkpoint, adapter, and final merged
+  artifact.
+
+## Backend Choice
+
+| Need | Step |
+|---|---|
+| Full large-scale SFT with Megatron checkpoint output | `sft/megatron_bridge` |
+| Adapter tuning on Megatron checkpoint | `peft/megatron_bridge` |
+| HF-native checkpoint with JSONL data | `sft/automodel` or `peft/automodel` |
+| Memory is too tight for full SFT | PEFT/LoRA first |
+
+## Config Rules
+
+- Start with micro batch size 1 for new shapes.
+- Keep global batch size divisible by data-parallel size.
+- Use `load_hf_weights=false` when starting from a Megatron checkpoint.
+- For adapter reliability, prefer simple checkpoint saves over async/optimizer
+  saves unless the user explicitly needs them.
+
+## Failure Modes
+
+- `missing_packed_data`: run `data_prep/sft_packing`.
+- `sequence_length_mismatch`: repack data or align `seq_length`.
+- `missing_base_checkpoint`: set `checkpoint.pretrained_checkpoint` to the Megatron base checkpoint for PEFT.
+- `oom`: lower micro batch, enable recomputation, increase parallelism, or use
+  LoRA.
diff --git a/skills/nemotron-customize/context/modelopt-optimization.txt b/skills/nemotron-customize/references/context/modelopt-optimization.txt
similarity index 98%
rename from skills/nemotron-customize/context/modelopt-optimization.txt
rename to skills/nemotron-customize/references/context/modelopt-optimization.txt
index 42d50542f..9c5fa4601 100644
--- a/skills/nemotron-customize/context/modelopt-optimization.txt
+++ b/skills/nemotron-customize/references/context/modelopt-optimization.txt
@@ -193,7 +193,7 @@ args:
     - /data/tokenized/domain_text_document
 ```
 
-Use `prep/pretrain_prep` first when data starts as HF/local text.
+Use `data_prep/pretrain_prep` first when data starts as HF/local text.
 `use_mock_data: true` is **plumbing only**, not a quality signal.
 
 Distillation patterns:
@@ -211,7 +211,7 @@ Common chains:
 ```
 sft/automodel        → optimize/modelopt/quantize → eval/model_eval
 sft/automodel        → optimize/modelopt/prune    → optimize/modelopt/distill → eval/model_eval
-prep/pretrain_prep   → optimize/modelopt/distill  → eval/model_eval
+data_prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval
 ```
 
 Artifact rules:
diff --git a/skills/nemotron-customize/context/nemo-rl-alignment.txt b/skills/nemotron-customize/references/context/nemo-rl-alignment.txt
similarity index 96%
rename from skills/nemotron-customize/context/nemo-rl-alignment.txt
rename to skills/nemotron-customize/references/context/nemo-rl-alignment.txt
index cc9176aa0..bd1bfeabd 100644
--- a/skills/nemotron-customize/context/nemo-rl-alignment.txt
+++ b/skills/nemotron-customize/references/context/nemo-rl-alignment.txt
@@ -107,7 +107,7 @@ Strategies:
 
 In-repo entry path: `src/nemotron/recipes/super3/stage2_rl/stage3_rlhf`.
 
-## Data prep (use `prep/rl_prep` upstream)
+## Data prep (use `data_prep/rl_prep` upstream)
 
 For DPO, preserve `prompt`, `chosen`, `rejected`; validate non-empty
 chosen/rejected; keep train/validation splits deterministic.
@@ -120,9 +120,9 @@ For RLHF, preserve prompt metadata required by the reward model.
 ## Pipeline placement
 
 ```
-sdg/data_designer  → prep/rl_prep → rl/nemo_rl/dpo
-prep/rl_prep                       → rl/nemo_rl/rlvr
-prep/rl_prep                       → rl/nemo_rl/rlhf
+sdg/data_designer  → data_prep/rl_prep → rl/nemo_rl/dpo
+data_prep/rl_prep                       → rl/nemo_rl/rlvr
+data_prep/rl_prep                       → rl/nemo_rl/rlhf
 ```
 
 ## Artifact rules
diff --git a/skills/nemotron-customize/context/nemotron-data-prep.txt b/skills/nemotron-customize/references/context/nemotron-data-prep.txt
similarity index 79%
rename from skills/nemotron-customize/context/nemotron-data-prep.txt
rename to skills/nemotron-customize/references/context/nemotron-data-prep.txt
index dc3817c42..5c78e7023 100644
--- a/skills/nemotron-customize/context/nemotron-data-prep.txt
+++ b/skills/nemotron-customize/references/context/nemotron-data-prep.txt
@@ -1,10 +1,10 @@
 # Nemotron Data Prep Context
 
-Use this pack when generating stage code for any of the prep steps:
+Use this pack when generating stage code for any of the data_prep steps:
 
-- `prep/sft_packing`     → produces `packed_parquet`
-- `prep/pretrain_prep`   → produces `binidx` + `blend.json`
-- `prep/rl_prep`         → produces `training_jsonl` (sharded)
+- `data_prep/sft_packing`     → produces `packed_parquet`
+- `data_prep/pretrain_prep`   → produces `binidx` + `blend.json`
+- `data_prep/rl_prep`         → produces `training_jsonl` (sharded)
 
 The step family wraps `src/nemotron/data_prep` recipes. Generated stage code
 should be a thin wrapper around the recipe entry point — no schema knowledge
@@ -12,14 +12,14 @@ in Python.
 
 ## In-repo references (read these first)
 
-- Manifests: `src/nemotron/steps/prep/<step>/step.toml`
-- Per-step SKILL: `src/nemotron/steps/prep/<step>/SKILL.md`
-- Category SKILL: `src/nemotron/steps/prep/SKILL.md` (which step + when-to-pack table)
-- Shared helpers: `src/nemotron/steps/prep/_common.py`
+- Manifests: `src/nemotron/steps/data_prep/<step>/step.toml`
+- Per-step SKILL: `src/nemotron/steps/data_prep/<step>/SKILL.md`
+- Category SKILL: `src/nemotron/steps/data_prep/SKILL.md` (which step + when-to-pack table)
+- Shared helpers: `src/nemotron/steps/data_prep/_common.py`
 
-## Shared helpers (`prep/_common.py`)
+## Shared helpers (`data_prep/_common.py`)
 
-Use these in every prep stage wrapper:
+Use these in every data_prep stage wrapper:
 
 - `resolve_blend_path(cfg, *, step_dir, default_name="blend_tiny.json")` —
   resolve blend path from config, falling back to a step-bundled default.
@@ -36,7 +36,7 @@ Order in your `run.py`:
 3. `chdir_to_scratch(...)` only after all paths are resolved.
 4. Call the recipe.
 
-## Shared principles across prep steps
+## Shared principles across data_prep steps
 
 - **Tokenizer-locked outputs.** Repack on tokenizer / template / seq_length
   change. See `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md`.
@@ -48,7 +48,7 @@ Order in your `run.py`:
 - **Receipts near the output.** Manifests / blend.json / split metadata land
   next to the produced shards so downstream stages can validate.
 
-## SFT packing (`prep/sft_packing`)
+## SFT packing (`data_prep/sft_packing`)
 
 Consumes OpenAI chat-format JSONL, emits packed Parquet for
 Megatron-Bridge SFT/PEFT.
@@ -75,7 +75,7 @@ Use this **before**:
 Skip this **before**:
 - `sft/automodel`, `peft/automodel` (read `training_jsonl` directly).
 
-## Pretraining prep (`prep/pretrain_prep`)
+## Pretraining prep (`data_prep/pretrain_prep`)
 
 Consumes curated text (HF datasets or local parquet/jsonl), emits Megatron
 bin/idx shards plus `blend.json`.
@@ -98,7 +98,7 @@ Use this **before**:
 - `pretrain/automodel` (the env var `PRETRAIN_BLEND_PATH` points at the produced `blend.json`)
 - `optimize/modelopt/distill` (real-data runs, not `use_mock_data`)
 
-## RL prep (`prep/rl_prep`)
+## RL prep (`data_prep/rl_prep`)
 
 Consumes a blend referencing HF or local prompt/preference datasets, emits
 sharded JSONL ready for `rl/nemo_rl/{dpo,rlvr,rlhf}`.
@@ -124,25 +124,25 @@ Schema preserved per algorithm:
 ## Pipeline placement
 
 ```
-curate/nemo_curator → prep/pretrain_prep → pretrain/{megatron_bridge,automodel}
-curate/nemo_curator → translate/nemo_skills → prep/sft_packing → sft/megatron_bridge
+curate/nemo_curator → data_prep/pretrain_prep → pretrain/{megatron_bridge,automodel}
+curate/nemo_curator → translate/nemo_curator → data_prep/sft_packing → sft/megatron_bridge
                                               ↓
                                           (skip packing) → sft/automodel
-sdg/data_designer  → prep/sft_packing → sft/megatron_bridge
-sdg/data_designer  → prep/rl_prep     → rl/nemo_rl/dpo
-prep/pretrain_prep → optimize/modelopt/distill
+sdg/data_designer       → data_prep/sft_packing → sft/megatron_bridge
+sdg/data_designer       → data_prep/rl_prep     → rl/nemo_rl/dpo
+data_prep/pretrain_prep → optimize/modelopt/distill
 ```
 
 ## Verification
 
 ```bash
-uv run pytest tests/steps/prep -q     # focused
+uv run pytest tests/steps/data_prep -q     # focused
 uv run pytest tests/steps -q          # full step-family suite
 ```
 
 ## Staleness checks
 
-When updating prep steps:
+When updating data_prep steps:
 
 - Verify downstream artifact type still matches (`packed_parquet`, `binidx`, `training_jsonl`).
 - Verify output path is resolved **before** the scratch chdir.
diff --git a/skills/nemotron-customize/examples/CALIBRATION.md b/skills/nemotron-customize/references/examples/CALIBRATION.md
similarity index 80%
rename from skills/nemotron-customize/examples/CALIBRATION.md
rename to skills/nemotron-customize/references/examples/CALIBRATION.md
index 744b5f1f5..e4764b7dd 100644
--- a/skills/nemotron-customize/examples/CALIBRATION.md
+++ b/skills/nemotron-customize/references/examples/CALIBRATION.md
@@ -16,7 +16,7 @@ A: Two options for Nano3 SFT:
 | | Megatron-Bridge | AutoModel |
 |---|---|---|
 | Min GPUs | 8 H100 | 4 |
-| Data format | Packed Parquet (needs prep step) | JSONL directly |
+| Data format | Packed Parquet (needs data_prep step) | JSONL directly |
 | LoRA support | Config flag | Built-in |
 
 Which fits your setup? And how many GPUs do you have?
@@ -24,7 +24,7 @@ Which fits your setup? And how many GPUs do you have?
 user: I have 8 H100s, let's use Megatron-Bridge
 
 A: Got it. I'll build a 3-stage pipeline:
-1. prep/sft_packing → pack your JSONL to Parquet
+1. data_prep/sft_packing → pack your JSONL to Parquet
 2. sft/megatron_bridge → Nano3 SFT (tp=4, pp=1, cp=2)
 3. eval/model_eval → benchmark the result
 
@@ -49,17 +49,17 @@ Here's the plan:
 # Pipeline Plan: thai-nano3-sft-rl
 ## Stages
 1. curate/nemo_curator   → filtered_jsonl
-2. translate/translation → translated_jsonl
-3. prep/sft_packing      → packed_parquet
+2. translate/nemo_curator → translated_jsonl
+3. data_prep/sft_packing → packed_parquet
 4. sft/megatron_bridge   → checkpoint_megatron
-5. rl/nemo_rl_grpo       → checkpoint_megatron (RL-tuned)
+5. rl/nemo_rl/rlvr       → checkpoint_megatron (RL-tuned)
 6. eval/model_eval       → eval_results
 
 ## Validation
 ✓ All artifact types chain
-✓ Tokenizer consistent (Nano3 across prep + SFT + RL)
-⚠ RL needs 16+ GPUs (2 nodes) — do you have that?
-⚠ translate needs an LLM endpoint (NIM or vLLM) — do you have one?
+✓ Tokenizer consistent (Nano3 across data_prep + SFT + RL)
+WARNING: RL needs 16+ GPUs (2 nodes) — do you have that?
+WARNING: translate needs an LLM endpoint (NIM or vLLM) — do you have one?
 
 Approve this plan, or want changes?
 ```
diff --git a/skills/nemotron-nano3/context/quick-reference.md b/skills/nemotron-nano3/context/quick-reference.md
index 26b525a76..3eb6f5c26 100644
--- a/skills/nemotron-nano3/context/quick-reference.md
+++ b/skills/nemotron-nano3/context/quick-reference.md
@@ -236,10 +236,10 @@ So if asked “can I match the paper numbers?” the safe answer is:
 | Need | Step / mode |
 |---|---|
 | curate or filter new corpus | `curate/nemo_curator` |
-| pack Nano3 SFT JSONL | `prep/sft_packing` |
+| pack Nano3 SFT JSONL | `data_prep/sft_packing` |
 | SFT with Megatron-Bridge | `sft/megatron_bridge` |
 | SFT with smaller GPU counts / LoRA | `sft/automodel` |
-| RL with GRPO | `rl/nemo_rl_grpo` *(present in the step catalog as planned; ground it on `src/nemotron/recipes/nano3/stage2_rl/`)* |
+| RL with GRPO | `rl/nemo_rl/rlvr` |
 | evaluate model | `eval/model_eval` |
 | convert HF → Megatron | `convert/hf_to_megatron` |
 | convert Megatron → HF | `convert/megatron_to_hf` |
@@ -248,7 +248,7 @@ So if asked “can I match the paper numbers?” the safe answer is:
 
 There is **currently no public catalog step** for Nano3 pretraining in `src/nemotron/steps/STEPS.md`.
 
-Also note that `rl/nemo_rl_grpo` is listed in the step catalog but its manifest marks it as **planned**, so RL builds should still be grounded carefully on the stage2 recipe rather than treated as a fully mature catalog path.
+For Nano3 RL builds, use `rl/nemo_rl/rlvr` and ground recipe-specific settings on `src/nemotron/recipes/nano3/stage2_rl/`.
 
 If the user asks to build stage0 pretraining via `/nemotron-customize`:
 
@@ -258,8 +258,7 @@ If the user asks to build stage0 pretraining via `/nemotron-customize`:
 
 If the user asks for Nano3 RL generation via `/nemotron-customize`:
 
-- mention `rl/nemo_rl_grpo` as the closest step surface
-- note that it is still marked **planned**
+- mention `rl/nemo_rl/rlvr` as the catalog step surface
 - ground specifics on `src/nemotron/recipes/nano3/stage2_rl/`
 
 ## 19. Quick Answer Templates
diff --git a/skills/nemotron-nano3/paper/_overview.md b/skills/nemotron-nano3/paper/_overview.md
index eb631675d..f51fc7c6b 100644
--- a/skills/nemotron-nano3/paper/_overview.md
+++ b/skills/nemotron-nano3/paper/_overview.md
@@ -14,7 +14,7 @@ key_facts:
   - "The release bundle includes Base BF16, post-trained BF16, FP8 model weights, a GenRM, recipe code, and most of the newly added data collections."
 related_steps:
   - "sft/megatron_bridge"
-  - "rl/nemo_rl_grpo"
+  - "rl/nemo_rl/rlvr"
   - "eval/model_eval"
   - "convert/hf_to_megatron"
 currency: "frozen"
diff --git a/skills/nemotron-nano3/paper/architecture.md b/skills/nemotron-nano3/paper/architecture.md
index b8477bcc2..6d193bee6 100644
--- a/skills/nemotron-nano3/paper/architecture.md
+++ b/skills/nemotron-nano3/paper/architecture.md
@@ -14,7 +14,7 @@ key_facts:
   - "Long-context extension is done with a separate continuous-pretraining phase using 8-way context, tensor, and expert parallelism plus 4-way pipeline parallelism."
 related_steps:
   - "sft/megatron_bridge"
-  - "rl/nemo_rl_grpo"
+  - "rl/nemo_rl/rlvr"
   - "eval/model_eval"
   - "convert/hf_to_megatron"
 currency: "frozen"
diff --git a/skills/nemotron-nano3/paper/data.md b/skills/nemotron-nano3/paper/data.md
index cb7e6ad7d..0bbb744a2 100644
--- a/skills/nemotron-nano3/paper/data.md
+++ b/skills/nemotron-nano3/paper/data.md
@@ -14,8 +14,8 @@ key_facts:
   - "The SFT mixture spans competition math/code, tool use, long-context data, Lean proofs, multilingual data, terminal use, general chat, instruction following, safety, SWE, science, GenSelect, and CUDA pairs."
 related_steps:
   - "curate/nemo_curator"
-  - "prep/sft_packing"
-  - "translate/nemo_skills"
+  - "data_prep/sft_packing"
+  - "translate/nemo_curator"
   - "sft/megatron_bridge"
 currency: "frozen"
 ---
diff --git a/skills/nemotron-nano3/paper/evaluation.md b/skills/nemotron-nano3/paper/evaluation.md
index 30a916f80..375ad45dd 100644
--- a/skills/nemotron-nano3/paper/evaluation.md
+++ b/skills/nemotron-nano3/paper/evaluation.md
@@ -16,7 +16,7 @@ related_steps:
   - "eval/model_eval"
   - "convert/megatron_to_hf"
   - "convert/hf_to_megatron"
-  - "rl/nemo_rl_grpo"
+  - "rl/nemo_rl/rlvr"
 currency: "frozen"
 ---
 
diff --git a/skills/nemotron-nano3/paper/rl.md b/skills/nemotron-nano3/paper/rl.md
index 9942609fc..e13fcfbf9 100644
--- a/skills/nemotron-nano3/paper/rl.md
+++ b/skills/nemotron-nano3/paper/rl.md
@@ -13,7 +13,7 @@ key_facts:
   - "The paper says RLVR can match or surpass a heavily fine-tuned SFT checkpoint."
   - "The RLHF stage uses a GenRM plus group-relative length control and reports roughly 30% lower verbosity without sacrificing accuracy."
 related_steps:
-  - "rl/nemo_rl_grpo"
+  - "rl/nemo_rl/rlvr"
   - "eval/model_eval"
   - "convert/megatron_to_hf"
   - "sft/megatron_bridge"
diff --git a/skills/nemotron-nano3/paper/safety.md b/skills/nemotron-nano3/paper/safety.md
index 100e8cfcf..a2bafdaa3 100644
--- a/skills/nemotron-nano3/paper/safety.md
+++ b/skills/nemotron-nano3/paper/safety.md
@@ -13,7 +13,7 @@ key_facts:
   - "The DPO run uses learning rate 3e-6, batch size 128, and 50 training steps."
   - "In the DPO appendix, hallucinated tool usage drops from 1.25% to 0% on AIME25 and from 8.33% to 0.7% on GPQA."
 related_steps:
-  - "rl/nemo_rl_grpo"
+  - "rl/nemo_rl/rlvr"
   - "eval/model_eval"
   - "sft/megatron_bridge"
 currency: "frozen"
diff --git a/skills/nemotron-nano3/paper/sft.md b/skills/nemotron-nano3/paper/sft.md
index 5f3d14624..914f79275 100644
--- a/skills/nemotron-nano3/paper/sft.md
+++ b/skills/nemotron-nano3/paper/sft.md
@@ -13,7 +13,7 @@ key_facts:
   - "The SFT run trains for 13,000 steps with batch size 64 and sequence packing to 256k."
   - "The SFT domains include competition math/code, tool use, long context, formal proofs, multilingual, terminal use, safety, software engineering, and science."
 related_steps:
-  - "prep/sft_packing"
+  - "data_prep/sft_packing"
   - "sft/megatron_bridge"
   - "sft/automodel"
   - "convert/hf_to_megatron"
diff --git a/skills/nemotron-nano3/recipes/overview.md b/skills/nemotron-nano3/recipes/overview.md
index 46a2824d1..b878b13b8 100644
--- a/skills/nemotron-nano3/recipes/overview.md
+++ b/skills/nemotron-nano3/recipes/overview.md
@@ -99,10 +99,10 @@ Use this map when the user shifts from “what is Nano3?” to “help me build
 | Goal | `nemotron-customize` path |
 |---|---|
 | curate text corpora | `curate/nemo_curator` |
-| pack SFT JSONL for Megatron | `prep/sft_packing` |
+| pack SFT JSONL for Megatron | `data_prep/sft_packing` |
 | run Nano3-style Megatron SFT | `sft/megatron_bridge` |
 | run smaller-GPU SFT | `sft/automodel` |
-| run GRPO alignment | `rl/nemo_rl_grpo` *(listed in the step catalog as planned; ground it on the stage2 recipe)* |
+| run GRPO alignment | `rl/nemo_rl/rlvr` |
 | benchmark a checkpoint | `eval/model_eval` |
 | convert released HF weights to Megatron | `convert/hf_to_megatron` |
 | export Megatron checkpoint back to HF | `convert/megatron_to_hf` |
@@ -116,12 +116,10 @@ So when a user wants to build Nano3-like pretraining with `/nemotron-customize`:
 - ground on `src/nemotron/recipes/nano3/stage0_pretrain/`
 - optionally combine with `curate/nemo_curator` for upstream corpus work
 
-For RL, `rl/nemo_rl_grpo` does appear in the step catalog, but its manifest is marked **planned**.
-So RL requests should still be handled with recipe-grounded caution:
+For RL, use `rl/nemo_rl/rlvr` as the catalog step and keep Nano3-specific details grounded in the recipe:
 
-- mention `rl/nemo_rl_grpo` as the nearest step surface
+- mention `rl/nemo_rl/rlvr` as the step surface
 - ground details on `src/nemotron/recipes/nano3/stage2_rl/`
-- avoid implying that the RL catalog path is as mature as the SFT/eval paths
 
 ## Recommended Answer Pattern
 
diff --git a/skills/nemotron-nano3/recipes/stage1_sft.md b/skills/nemotron-nano3/recipes/stage1_sft.md
index 8039f5f37..f32c33bc9 100644
--- a/skills/nemotron-nano3/recipes/stage1_sft.md
+++ b/skills/nemotron-nano3/recipes/stage1_sft.md
@@ -117,7 +117,7 @@ Even with that scale gap, stage1 is the best public explanation of how Nano3 SFT
 
 This stage maps cleanly to catalog steps:
 
-1. `prep/sft_packing`
+1. `data_prep/sft_packing`
 2. `sft/megatron_bridge`
 
 Optional surrounding steps:
@@ -127,4 +127,4 @@ Optional surrounding steps:
 
 ## Good Handoff Pattern
 
-> “For a public Nano3-style SFT build, use `prep/sft_packing` to produce packed Parquet and `sft/megatron_bridge` to run Megatron-Bridge fine-tuning. That reproduces the stage shape, but not the paper’s full 18M-sample, 256k packed run.”
+> “For a public Nano3-style SFT build, use `data_prep/sft_packing` to produce packed Parquet and `sft/megatron_bridge` to run Megatron-Bridge fine-tuning. That reproduces the stage shape, but not the paper’s full 18M-sample, 256k packed run.”
diff --git a/skills/nemotron-nano3/recipes/stage2_rl.md b/skills/nemotron-nano3/recipes/stage2_rl.md
index 6f191ef26..089172879 100644
--- a/skills/nemotron-nano3/recipes/stage2_rl.md
+++ b/skills/nemotron-nano3/recipes/stage2_rl.md
@@ -117,13 +117,12 @@ The repo gives you the GRPO/RLVR backbone directly, but not a separate one-click
 
 The closest catalog step surface is:
 
-- `rl/nemo_rl_grpo`
+- `rl/nemo_rl/rlvr`
 
-Important maturity note:
+Important grounding note:
 
-- the step exists in `src/nemotron/steps/` and is listed in `STEPS.md`
-- its manifest marks it as **planned**
-- so generation/build work should still be grounded on `src/nemotron/recipes/nano3/stage2_rl/`
+- use the catalog step for GRPO/RLVR wiring
+- ground Nano3-specific data and config details on `src/nemotron/recipes/nano3/stage2_rl/`
 
 Common surrounding steps:
 
@@ -133,4 +132,4 @@ Common surrounding steps:
 
 ## Good Handoff Pattern
 
-> “For the public Nano3 RL path, the nearest `nemotron-customize` surface is `rl/nemo_rl_grpo`, but it is still marked planned. I would ground the build on `src/nemotron/recipes/nano3/stage2_rl/` and use the step only as the conceptual catalog bridge.”
+> “For the public Nano3 RL path, use `rl/nemo_rl/rlvr` as the `nemotron-customize` surface and ground the concrete config details on `src/nemotron/recipes/nano3/stage2_rl/`.”
diff --git a/src/nemo_runspec/data_mover.py b/src/nemo_runspec/data_mover.py
index 6f8506efd..da08cdd59 100644
--- a/src/nemo_runspec/data_mover.py
+++ b/src/nemo_runspec/data_mover.py
@@ -52,7 +52,27 @@
         "node_modules",
     }
 )
-_EXCLUDE_SUFFIXES = (".pyc", ".pyo", ".pyd")
+_EXCLUDE_SUFFIXES = (
+    ".pyc",
+    ".pyo",
+    ".pyd",
+    # Common data/model artifacts should not ride along in the source tarball.
+    ".parquet",
+    ".arrow",
+    ".bin",
+    ".idx",
+    ".npy",
+    ".npz",
+    ".pt",
+    ".pth",
+    ".safetensors",
+    ".ckpt",
+    ".onnx",
+    ".h5",
+    ".hdf5",
+)
+_SCOPED_COLLECTIONS = frozenset({"recipes", "steps"})
+_DEFAULT_TARBALL_WARN_BYTES = 1_000_000
 
 
 def _tar_filter(info):
@@ -62,99 +82,135 @@ def _tar_filter(info):
     return info
 
 
+def _tarball_warn_bytes() -> int:
+    raw = os.environ.get("NEMOTRON_SRC_TARBALL_WARN_BYTES")
+    if raw is None:
+        return _DEFAULT_TARBALL_WARN_BYTES
+    try:
+        return max(0, int(raw))
+    except ValueError:
+        return _DEFAULT_TARBALL_WARN_BYTES
+
+
+def _format_bytes(size: int) -> str:
+    if size < 1024 * 1024:
+        return f"{size / 1024:.1f} KiB"
+    return f"{size / (1024 * 1024):.1f} MiB"
+
+
+def _warn_if_large_tarball(path: str) -> None:
+    limit = _tarball_warn_bytes()
+    if not limit:
+        return
+    size = os.path.getsize(path)
+    if size <= limit:
+        return
+    typer.secho(
+        "[stage] warning: source tarball is "
+        f"{_format_bytes(size)}; this may exceed cloud job/env limits. "
+        "Move large artifacts outside src/ or extend the data_mover exclude "
+        "suffix list. Set NEMOTRON_SRC_TARBALL_WARN_BYTES=0 to disable.",
+        fg=typer.colors.YELLOW,
+        err=True,
+    )
+
+
+@dataclass(frozen=True)
+class _ScriptLocation:
+    package: str
+    collection: str | None
+    branch: str | None
+
+
+def _repo_relative_path(repo_root: Path, path: str) -> Path:
+    candidate = Path(path)
+    if candidate.is_absolute():
+        try:
+            return candidate.relative_to(repo_root)
+        except ValueError:
+            if "src" in candidate.parts:
+                return Path(*candidate.parts[candidate.parts.index("src") :])
+    return candidate
+
+
+def _script_location(repo_root: Path, script_path: str | None) -> _ScriptLocation | None:
+    if not script_path:
+        return None
+    rel = _repo_relative_path(repo_root, script_path)
+    parts = rel.parts
+    if len(parts) < 3 or parts[0] != "src":
+        return None
+    collection = parts[2] if len(parts) >= 4 else None
+    branch = parts[3] if len(parts) >= 5 else None
+    return _ScriptLocation(
+        package=parts[1],
+        collection=collection,
+        branch=branch,
+    )
+
+
+def _include_collection(
+    includes: list[str],
+    *,
+    pkg_name: str,
+    collection_name: str,
+    collection_dir: Path,
+    branch: str | None = None,
+) -> None:
+    prefix = f"src/{pkg_name}/{collection_name}"
+    for child in sorted(collection_dir.iterdir()):
+        if child.name in _EXCLUDE_NAMES:
+            continue
+        if branch is None or child.is_file() or child.name == branch or child.name.startswith("_"):
+            includes.append(f"{prefix}/{child.name}")
+
+
 def _auto_includes(repo_root: Path, script_path: str | None) -> list[str]:
     """Discover repo-relative paths to ship.
 
-    Walks ``<repo>/src/*`` and ships every top-level package. For packages
-    with ``recipes/`` or ``steps/``, only the active recipe family or step
-    subtree from ``script_path`` is included when possible. This keeps the
-    tarball small because unrelated families and steps can weigh many MiB.
+    Walks ``<repo>/src/*`` and ships every top-level package. For packages with
+    large source collections such as ``recipes/`` or ``steps/``, only the active
+    branch from ``script_path`` is included when possible. This keeps the
+    tarball small because unrelated runnable collections can weigh many MiB.
     """
     src = repo_root / "src"
     if not src.is_dir():
         raise ValueError(f"No src/ under {repo_root}. Set repo_root in env.toml.")
 
     includes: list[str] = []
-
-    # Filters keyed off script_path: ship only the active recipe family or
-    # active step subtree. Lepton's etcd has a hard request cap (~1.5 MiB);
-    # DGXCloud has a tighter per-env-var cap. Without filtering, every
-    # unrelated recipe family + step ships and blows past those limits.
-    family = None
-    step_path: str | None = None
-    if script_path:
-        parts = Path(script_path).parts
-        if "recipes" in parts:
-            idx = parts.index("recipes")
-            if idx + 1 < len(parts):
-                family = parts[idx + 1]
-        elif "steps" in parts:
-            idx = parts.index("steps")
-            tail = parts[idx + 1 : -1]  # drop the step.py filename
-            if tail:
-                step_path = "/".join(tail)
+    script = _script_location(repo_root, script_path)
 
     for pkg in sorted(p for p in src.iterdir() if p.is_dir() and p.name not in _EXCLUDE_NAMES):
-        recipes = pkg / "recipes"
-        steps = pkg / "steps"
-        has_recipes = recipes.is_dir()
-        has_steps = steps.is_dir()
+        active_collection = (
+            script.collection
+            if script and script.package == pkg.name and script.collection and script.branch
+            else None
+        )
+        collection_names = {name for name in _SCOPED_COLLECTIONS if (pkg / name).is_dir()}
+        if active_collection and (pkg / active_collection).is_dir():
+            collection_names.add(active_collection)
 
-        if has_recipes or has_steps:
+        if collection_names:
             for child in sorted(pkg.iterdir()):
-                if child.name in _EXCLUDE_NAMES or child == recipes or child == steps:
+                if child.name in _EXCLUDE_NAMES or child.name in collection_names:
                     continue
                 includes.append(f"src/{pkg.name}/{child.name}")
 
-            if has_recipes:
-                for child in sorted(recipes.iterdir()):
-                    if child.is_file():
-                        includes.append(f"src/{pkg.name}/recipes/{child.name}")
-                if family and (recipes / family).is_dir():
-                    chosen_families = [family]
-                elif step_path:
-                    # Shipping a step — don't drag any recipe family along.
-                    chosen_families = []
-                else:
-                    chosen_families = [
-                        c.name for c in recipes.iterdir() if c.is_dir() and c.name not in _EXCLUDE_NAMES
-                    ]
-                for fam in sorted(chosen_families):
-                    includes.append(f"src/{pkg.name}/recipes/{fam}")
-
-            if has_steps:
-                # Top-level files (e.g. index.py, types.toml) always ride along.
-                for child in sorted(steps.iterdir()):
-                    if child.is_file():
-                        includes.append(f"src/{pkg.name}/steps/{child.name}")
-                if step_path and (steps / step_path).is_dir():
-                    # Active step's leaf + ancestor ``__init__.py`` files so
-                    # ``python -m nemotron.steps.<a>.<b>.step`` can traverse
-                    # the package path. Without these the runner imports the
-                    # leaf module directly but Python can't resolve the chain.
-                    parts = Path(step_path).parts
-                    for i in range(1, len(parts)):
-                        ancestor = "/".join(parts[:i])
-                        ancestor_dir = steps / ancestor
-                        for child in sorted(ancestor_dir.iterdir()):
-                            if child.is_file() and (
-                                child.name == "__init__.py" or child.name.startswith("_")
-                            ):
-                                includes.append(f"src/{pkg.name}/steps/{ancestor}/{child.name}")
-                    includes.append(f"src/{pkg.name}/steps/{step_path}")
-                    # Any shared-infra sibling (``_runners/`` etc.) that step
-                    # wrappers import from — leading-underscore convention.
-                    for child in sorted(steps.iterdir()):
-                        if child.is_dir() and child.name.startswith("_") and child.name not in _EXCLUDE_NAMES:
-                            includes.append(f"src/{pkg.name}/steps/{child.name}")
-                else:
-                    # No active step (e.g. shipping a recipe) — include all.
-                    for child in sorted(steps.iterdir()):
-                        if child.is_dir() and child.name not in _EXCLUDE_NAMES:
-                            includes.append(f"src/{pkg.name}/steps/{child.name}")
+            for collection_name in sorted(collection_names):
+                collection_dir = pkg / collection_name
+                is_active = active_collection == collection_name and script and script.branch
+                if active_collection and not is_active:
+                    continue
+                _include_collection(
+                    includes,
+                    pkg_name=pkg.name,
+                    collection_name=collection_name,
+                    collection_dir=collection_dir,
+                    branch=script.branch if is_active else None,
+                )
         else:
             includes.append(f"src/{pkg.name}")
-    return includes
+    return list(dict.fromkeys(includes))
 
 
 @dataclass(kw_only=True)
@@ -180,6 +236,7 @@ def package(self, path, job_dir, name):  # type: ignore[override]
             with tarfile.open(out, "w:gz") as tf:
                 for rel in _auto_includes(root, self.script_path):
                     tf.add(root / rel, arcname=rel, filter=_tar_filter)
+        _warn_if_large_tarball(out)
         return out
 
 
@@ -265,6 +322,7 @@ def plan_for(
         return Plan(
             packager=run.Packager(),
             pod_src_root=pod_src,
+            needs_pwd_symlinks=True,
             source_ready_marker=ready_marker,
             pre_script_cmds=[
                 'if [ "${NODE_RANK:-0}" = "0" ]; then'
diff --git a/src/nemo_runspec/execution.py b/src/nemo_runspec/execution.py
index 89ea6c916..f80e26bf0 100644
--- a/src/nemo_runspec/execution.py
+++ b/src/nemo_runspec/execution.py
@@ -32,11 +32,13 @@
 from __future__ import annotations
 
 import base64
+import hashlib
 import json
 import logging
 import os
 import shlex
 import subprocess
+import uuid
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Any
@@ -935,9 +937,10 @@ def _create_lepton_executor(
     if custom_spec:
         executor_kwargs["custom_spec"] = _to_plain(custom_spec)
 
-    # Pre-launch commands: user-defined + auto_mount git repos
+    # Keep Lepton executor pre-launch user-controlled. auto_mount git repos are
+    # cloned in the inline launch script so config decoding, source staging, and
+    # repo setup happen in one predictable order.
     pre_launch = list(_get_env(env, "pre_launch_commands") or [])
-    pre_launch.extend(_git_mount_commands())
     if pre_launch:
         executor_kwargs["pre_launch_commands"] = pre_launch
 
@@ -1058,6 +1061,12 @@ def _cloud_script_path(script_path: str, pod_src_root: str) -> str:
     return script_path
 
 
+def _cloud_config_path(nemotron_home: str, config_bytes: bytes) -> str:
+    """Return a per-submission config path (content digest + random suffix) on shared cloud storage."""
+    digest = hashlib.sha256(config_bytes).hexdigest()[:16]
+    return f"{nemotron_home}/config-{digest}-{uuid.uuid4().hex[:8]}.yaml"
+
+
 def _ray_node_source_sync_cmd(pod_src_root: str, ready_marker: str | None) -> str:
     """Return a shell command that ensures chunked source exists on every Ray node.
 
@@ -1145,6 +1154,16 @@ def _extract_on_node(raw_bytes, dest, marker):
     return f"python3 -c {shlex.quote(code)}"
 
 
+def _pwd_symlink_cmd(nemotron_home: str, pod_src_root: str) -> str:
+    """Expose staged source at ``$PWD/src`` for configs using ``${oc.env:PWD}``."""
+    return (
+        f"mkdir -p {nemotron_home}/src"
+        f" && rm -rf {nemotron_home}/src/nemotron {nemotron_home}/src/nemo_runspec"
+        f" && ln -sfn {pod_src_root}/nemotron {nemotron_home}/src/nemotron"
+        f" && ln -sfn {pod_src_root}/nemo_runspec {nemotron_home}/src/nemo_runspec"
+    )
+
+
 def execute_cloud(
     script_path: str,
     train_path: Path,
@@ -1186,10 +1205,11 @@ def execute_cloud(
     # ── 1. Workspace & paths ─────────────────────────────────────────
     workspace = _derive_cloud_workspace(env)
     nemotron_home = f"{workspace}/_nemotron"
-    config_path = f"{nemotron_home}/config.yaml"
+    config_bytes = train_path.read_bytes()
+    config_path = _cloud_config_path(nemotron_home, config_bytes)
 
     # ── 2. Config + source transport ────────────────────────────────
-    env_vars["_NEMOTRON_CONFIG_B64"] = base64.b64encode(train_path.read_bytes()).decode("ascii")
+    env_vars["_NEMOTRON_CONFIG_B64"] = base64.b64encode(config_bytes).decode("ascii")
     transport = data_mover.plan_for(
         executor_type=executor_type or "",
         env_vars=env_vars,
@@ -1279,11 +1299,7 @@ def execute_cloud(
         f" && export RAY_RUNTIME_ENV_PYTHONPATH={transport.pod_src_root}"
     )
     if transport.needs_pwd_symlinks:
-        launch_cmd += (
-            f" && mkdir -p {nemotron_home}/src"
-            f" && ln -sfn {transport.pod_src_root}/nemotron {nemotron_home}/src/nemotron"
-            f" && ln -sfn {transport.pod_src_root}/nemo_runspec {nemotron_home}/src/nemo_runspec"
-        )
+        launch_cmd += f" && {_pwd_symlink_cmd(nemotron_home, transport.pod_src_root)}"
     launch_cmd += f" && export PWD={nemotron_home} && cd {nemotron_home} && {script_cmd}"
     parts.append(launch_cmd)
 
@@ -1350,9 +1366,10 @@ def execute_cloud_ray(
 
     workspace = _derive_cloud_workspace(env)
     nemotron_home = f"{workspace}/_nemotron"
-    config_path = f"{nemotron_home}/config.yaml"
+    config_bytes = train_path.read_bytes()
+    config_path = _cloud_config_path(nemotron_home, config_bytes)
 
-    env_vars["_NEMOTRON_CONFIG_B64"] = base64.b64encode(train_path.read_bytes()).decode("ascii")
+    env_vars["_NEMOTRON_CONFIG_B64"] = base64.b64encode(config_bytes).decode("ascii")
 
     # Same source-transport strategy selection as the non-Ray path.
     transport = data_mover.plan_for(
@@ -1415,11 +1432,7 @@ def execute_cloud_ray(
     if transport.needs_pwd_symlinks:
         # Native-packager path: source sits at /nemo_run/code/src; symlink it
         # under nemotron_home/src so ${oc.env:PWD}/src/... still resolves.
-        head_setup.append(
-            f"mkdir -p {nemotron_home}/src"
-            f" && ln -sfn {transport.pod_src_root}/nemotron {nemotron_home}/src/nemotron"
-            f" && ln -sfn {transport.pod_src_root}/nemo_runspec {nemotron_home}/src/nemo_runspec"
-        )
+        head_setup.append(_pwd_symlink_cmd(nemotron_home, transport.pod_src_root))
     if startup_commands:
         head_setup.extend(startup_commands)
     full_cmd = " && ".join(
diff --git a/src/nemotron/cli/README.md b/src/nemotron/cli/README.md
index 23140f15b..cbaa532ca 100644
--- a/src/nemotron/cli/README.md
+++ b/src/nemotron/cli/README.md
@@ -21,7 +21,11 @@ nemotron = "nemotron.cli.bin.nemotron:main"
 
 ```
 nemotron
-├── byob                     # Bring-your-own benchmark generation and translation
+├── steps                    # Generic step catalog (list / show / run / lint)
+│   ├── list                 # List discovered steps
+│   ├── show                 # Show step manifest + runspec
+│   ├── run                  # Run any step (e.g. byob/mcq, translate/nemo_curator, sft/automodel)
+│   └── lint                 # Static checks on step manifests
 ├── nano3                    # Nano3 training recipe
 │   ├── pretrain             # Stage 0: Pretraining
 │   ├── sft                  # Stage 1: Supervised fine-tuning
@@ -102,10 +106,10 @@ uv run nemotron nano3 pretrain -c tiny --dry-run
 # Override config values
 uv run nemotron nano3 pretrain -c tiny train.train_iters=5000
 
-# BYOB benchmark generation
-uv run nemotron byob --family mcq --stage prepare --config src/nemotron/steps/byob/config/default.yaml
-uv run nemotron byob --family mcq --stage generate --config src/nemotron/steps/byob/config/default.yaml
-uv run nemotron byob --list-families
+# BYOB benchmark generation (via the generic step dispatcher)
+uv run nemotron steps run byob/mcq -c default stage=prepare family=mcq
+uv run nemotron steps run byob/mcq -c default stage=generate family=mcq
+uv run nemotron steps show byob/mcq        # parameters include family.choices
 
 # Data preparation
 uv run nemotron nano3 data prep pretrain --run MY-CLUSTER
diff --git a/src/nemotron/cli/bin/nemotron.py b/src/nemotron/cli/bin/nemotron.py
index 30f9f8d0c..801289834 100644
--- a/src/nemotron/cli/bin/nemotron.py
+++ b/src/nemotron/cli/bin/nemotron.py
@@ -117,7 +117,6 @@ def _register_groups() -> None:
         ("super3", "nemotron.cli.commands.super3", "super3_app"),
         ("kit", "nemotron.cli.kit", "kit_app"),
         ("embed", "nemotron.cli.commands.embed", "embed_app"),
-        ("step", "nemotron.cli.commands.step", "step_app"),
         ("steps", "nemotron.cli.commands.steps", "steps_app"),
     )
 
@@ -128,14 +127,6 @@ def _register_groups() -> None:
             if debug:
                 typer.echo(f"[nemotron] skipped '{name}' group: {exc}", err=True)
 
-    try:
-        from nemotron.cli.commands.byob import byob
-    except Exception as exc:
-        if debug:
-            typer.echo(f"[nemotron] skipped 'byob' command: {exc}", err=True)
-    else:
-        app.command(name="byob", rich_help_panel="Benchmarking")(byob)
-
 
 # Register groups on import
 _register_groups()
diff --git a/src/nemotron/cli/commands/byob.py b/src/nemotron/cli/commands/byob.py
deleted file mode 100644
index 1078a5712..000000000
--- a/src/nemotron/cli/commands/byob.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
-
-"""BYOB benchmark command."""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Annotated
-
-import typer
-
-from nemotron.steps.byob.scripts.runtime import list_family_names, run_byob
-
-VALID_STAGES = ("prepare", "generate", "translate")
-
-
-def byob(
-    config: Annotated[
-        Path | None,
-        typer.Option(
-            "--config",
-            "-c",
-            help="Path to the BYOB YAML config.",
-        ),
-    ] = None,
-    family: Annotated[
-        str,
-        typer.Option(
-            "--family",
-            help="Benchmark family to run.",
-        ),
-    ] = "mcq",
-    stage: Annotated[
-        str | None,
-        typer.Option(
-            "--stage",
-            help="Pipeline stage to run: prepare, generate, or translate.",
-        ),
-    ] = None,
-    skip_until: Annotated[
-        str | None,
-        typer.Option(
-            "--skip-until",
-            help="Resume from a family-specific stage enum name, such as JUDGEMENT or QUALITY_METRICS.",
-        ),
-    ] = None,
-    list_families: Annotated[
-        bool,
-        typer.Option(
-            "--list-families",
-            help="List registered BYOB benchmark families.",
-        ),
-    ] = False,
-) -> None:
-    """Run BYOB benchmark generation or translation."""
-
-    if list_families:
-        for registered_family in list_family_names():
-            typer.echo(registered_family)
-        return
-
-    if config is None:
-        typer.echo("Error: --config is required unless --list-families is set", err=True)
-        raise typer.Exit(1)
-
-    if stage is None:
-        typer.echo("Error: --stage is required unless --list-families is set", err=True)
-        raise typer.Exit(1)
-
-    if stage not in VALID_STAGES:
-        valid = ", ".join(VALID_STAGES)
-        typer.echo(f"Error: --stage must be one of: {valid}", err=True)
-        raise typer.Exit(1)
-
-    output_path = run_byob(
-        config=config,
-        stage=stage,
-        family=family,
-        skip_until=skip_until,
-    )
-    if output_path is not None:
-        typer.echo(output_path)
diff --git a/src/nemotron/cli/commands/step/__init__.py b/src/nemotron/cli/commands/step/__init__.py
deleted file mode 100644
index 9e1b49334..000000000
--- a/src/nemotron/cli/commands/step/__init__.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Generic step CLI — list / show / run any discovered step.
-
-Designed for agentic use: every step.py + step.toml in src/nemotron/steps/ is
-auto-discovered. The agent's surface is uniform regardless of the underlying
-framework (AutoModel, Megatron-Bridge, NeMo-RL, Data Designer).
-"""
-from __future__ import annotations
-
-import typer
-
-from nemotron.cli.commands.step.list_cmd import list_steps
-from nemotron.cli.commands.step.run_cmd import run_step
-from nemotron.cli.commands.step.show_cmd import show_step
-
-step_app = typer.Typer(
-    name="step",
-    help="Discover, inspect, and run any registered step.",
-    no_args_is_help=True,
-    rich_markup_mode="rich",
-    context_settings={"help_option_names": ["-h", "--help"]},
-)
-
-step_app.command("list", help="List discovered steps. Use --json for machine-readable output.")(list_steps)
-step_app.command("show", help="Show a step's manifest, runspec, and parameters.")(show_step)
-step_app.command(
-    "run",
-    help="Run a step on the chosen executor profile.",
-    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
-)(run_step)
-
-__all__ = ["step_app"]
diff --git a/src/nemotron/cli/commands/step/_resolve.py b/src/nemotron/cli/commands/steps/_resolve.py
similarity index 76%
rename from src/nemotron/cli/commands/step/_resolve.py
rename to src/nemotron/cli/commands/steps/_resolve.py
index 475721f95..01ea63ce6 100644
--- a/src/nemotron/cli/commands/step/_resolve.py
+++ b/src/nemotron/cli/commands/steps/_resolve.py
@@ -15,6 +15,8 @@
 """Shared step-id → StepInfo resolution helpers."""
 from __future__ import annotations
 
+import difflib
+
 import typer
 
 from nemotron.steps.index import StepInfo, discover_steps
@@ -23,11 +25,12 @@
 def resolve_step(step_id: str) -> StepInfo:
     """Find a step by id, with a helpful error if missing.
 
-    Accepts the canonical id (peft/automodel) or the directory tail (automodel)
-    when unambiguous.
+    Accepts the canonical id (``peft/automodel``) or the directory tail
+    (``automodel``) when unambiguous.
     """
     steps = discover_steps()
     by_id = {s.id: s for s in steps}
+
     if step_id in by_id:
         return by_id[step_id]
 
@@ -36,7 +39,9 @@ def resolve_step(step_id: str) -> StepInfo:
     if len(tail_matches) == 1:
         return tail_matches[0]
 
-    available = ", ".join(sorted(by_id))
     typer.echo(f"Unknown step id: {step_id}", err=True)
-    typer.echo(f"Available: {available}", err=True)
+    suggestions = difflib.get_close_matches(step_id, sorted(by_id), n=3, cutoff=0.5)
+    if suggestions:
+        typer.echo(f"Did you mean: {', '.join(suggestions)}?", err=True)
+    typer.echo("Run `nemotron steps list` to see all available steps.", err=True)
     raise typer.Exit(1)
diff --git a/src/nemotron/cli/commands/steps/_typer_group.py b/src/nemotron/cli/commands/steps/_typer_group.py
index 5296c479e..2a82785c8 100644
--- a/src/nemotron/cli/commands/steps/_typer_group.py
+++ b/src/nemotron/cli/commands/steps/_typer_group.py
@@ -16,15 +16,30 @@
 
 from __future__ import annotations
 
+import typer
+
 from nemo_runspec.recipe_typer import RecipeTyper
-from nemotron.cli.commands.steps.translation import META as TRANSLATION_META
-from nemotron.cli.commands.steps.translation import translation
+from nemotron.cli.commands.steps.list_cmd import list_steps
+from nemotron.cli.commands.steps.run_cmd import run_step
+from nemotron.cli.commands.steps.show_cmd import show_step
+
+
+def _add_catalog_commands(app: typer.Typer) -> None:
+    app.command("list", help="List discovered steps. Use --json for machine-readable output or --tree for a grouped view.")(list_steps)
+    app.command("show", help="Show a step's manifest, runspec, and parameters.")(show_step)
+    app.command(
+        "run",
+        help="Run a step on the chosen executor profile.",
+        context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    )(run_step)
+
 
 steps_app = RecipeTyper(
     name="steps",
-    help="Agentic workflow steps",
+    help="Discover, inspect, run, and compose agentic workflow steps.",
     no_args_is_help=True,
     rich_markup_mode="rich",
+    context_settings={"help_option_names": ["-h", "--help"]},
 )
 
-steps_app.add_recipe_command(translation, meta=TRANSLATION_META)
+_add_catalog_commands(steps_app)
diff --git a/src/nemotron/cli/commands/step/backends/__init__.py b/src/nemotron/cli/commands/steps/backends/__init__.py
similarity index 67%
rename from src/nemotron/cli/commands/step/backends/__init__.py
rename to src/nemotron/cli/commands/steps/backends/__init__.py
index eaabbde89..3b1ddac1c 100644
--- a/src/nemotron/cli/commands/step/backends/__init__.py
+++ b/src/nemotron/cli/commands/steps/backends/__init__.py
@@ -12,24 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Backend protocol + registry for ``nemotron step run``.
+"""Backend protocol + registry for ``nemotron steps run``.
 
 A ``Backend`` knows how to take a parsed step (script + runspec + rendered
-config + env profile) and submit it for execution. ``step run`` selects a
+config + env profile) and submit it for execution. ``steps run`` selects a
 backend by name from the env profile's ``executor`` field and calls
 :meth:`Backend.submit`. Adding a new backend is one new file under this
-package — ``step run`` itself does not change.
+package — ``steps run`` itself does not change.
 """
 from __future__ import annotations
 
-from nemotron.cli.commands.step.backends.base import Backend, JobContext
-from nemotron.cli.commands.step.backends.cloud import CloudBackend
-from nemotron.cli.commands.step.backends.local import LocalBackend
-from nemotron.cli.commands.step.backends.registry import get_backend, register
+from nemotron.cli.commands.steps.backends.base import Backend, JobContext
+from nemotron.cli.commands.steps.backends.cloud import CloudBackend
+from nemotron.cli.commands.steps.backends.local import LocalBackend
+from nemotron.cli.commands.steps.backends.registry import get_backend, register
 
 # Built-in backends are registered here so a fresh import sees them all.
 register("local", LocalBackend)
-register("slurm", "nemotron.cli.commands.step.backends.slurm:SlurmBackend")
+register("slurm", "nemotron.cli.commands.steps.backends.slurm:SlurmBackend")
 register("lepton", CloudBackend)
 register("dgxcloud", CloudBackend)
 
diff --git a/src/nemotron/cli/commands/step/backends/base.py b/src/nemotron/cli/commands/steps/backends/base.py
similarity index 97%
rename from src/nemotron/cli/commands/step/backends/base.py
rename to src/nemotron/cli/commands/steps/backends/base.py
index 727f087ea..1c8b510c0 100644
--- a/src/nemotron/cli/commands/step/backends/base.py
+++ b/src/nemotron/cli/commands/steps/backends/base.py
@@ -28,7 +28,7 @@ class JobContext:
     config. Backends never poke at globals; they read from this.
     """
 
-    step_id: str                       # e.g. "prep/sft_packing"
+    step_id: str                       # e.g. "data_prep/sft_packing"
     script_path: Path                  # absolute path to the local step.py
     train_path: Path                   # absolute path to the rendered train.yaml
     spec: Any                          # nemo_runspec.Runspec
diff --git a/src/nemotron/cli/commands/step/backends/cloud.py b/src/nemotron/cli/commands/steps/backends/cloud.py
similarity index 89%
rename from src/nemotron/cli/commands/step/backends/cloud.py
rename to src/nemotron/cli/commands/steps/backends/cloud.py
index 0a1208a6b..c079af721 100644
--- a/src/nemotron/cli/commands/step/backends/cloud.py
+++ b/src/nemotron/cli/commands/steps/backends/cloud.py
@@ -24,7 +24,7 @@
   distributed workload; ``execute_cloud`` handles its own torchrun wrap
   for ``launch = "torchrun"`` and a bare python invocation otherwise.
 
-Preparation steps are the exception: Xenna owns its own Ray initialization
+Data preparation steps are the exception: Xenna owns its own Ray initialization
 inside the worker pod, so cloud prep runs as a plain inline workload even when
 the local/Slurm runspec uses ``launch = "ray"``.
 
@@ -36,7 +36,7 @@
 from pathlib import Path
 
 from nemo_runspec.execution import execute_cloud, execute_cloud_ray
-from nemotron.cli.commands.step.backends.base import JobContext
+from nemotron.cli.commands.steps.backends.base import JobContext
 
 
 class CloudBackend:
@@ -62,7 +62,7 @@ def submit(self, ctx: JobContext) -> None:
             )
             return
 
-        # Plain distributed-workload path. Prep steps intentionally arrive here
+        # Plain distributed-workload path. Data prep steps intentionally arrive here
         # even with launch="ray": Xenna starts Ray inside the single cloud pod,
         # avoiding a Lepton RayCluster whose workers may not share Python deps.
         execute_cloud(
@@ -82,15 +82,15 @@ def submit(self, ctx: JobContext) -> None:
     @staticmethod
     def _uses_inline_cloud(ctx: JobContext) -> bool:
         """Return True for steps that should not create cloud RayClusters."""
-        return ctx.step_id.startswith("prep/")
+        return ctx.step_id.startswith("data_prep/")
 
     @staticmethod
     def _pod_relative_script(script_path: str) -> str:
         """Strip the local repo root so the cloud pod's cwd resolves the script.
 
-        Drivers see e.g. ``/home/.../src/nemotron/steps/prep/sft_packing/step.py``
+        Drivers see e.g. ``/home/.../src/nemotron/steps/data_prep/sft_packing/step.py``
         but the pod's workspace is the repo root, so we want
-        ``src/nemotron/steps/prep/sft_packing/step.py`` instead.
+        ``src/nemotron/steps/data_prep/sft_packing/step.py`` instead.
         """
         path = Path(script_path)
         parts = path.parts
diff --git a/src/nemotron/cli/commands/step/backends/local.py b/src/nemotron/cli/commands/steps/backends/local.py
similarity index 96%
rename from src/nemotron/cli/commands/step/backends/local.py
rename to src/nemotron/cli/commands/steps/backends/local.py
index ede023ae4..35a6bc2eb 100644
--- a/src/nemotron/cli/commands/step/backends/local.py
+++ b/src/nemotron/cli/commands/steps/backends/local.py
@@ -22,7 +22,7 @@
 import sys
 
 from nemo_runspec.execution import execute_local
-from nemotron.cli.commands.step.backends.base import JobContext
+from nemotron.cli.commands.steps.backends.base import JobContext
 
 
 class LocalBackend:
diff --git a/src/nemotron/cli/commands/step/backends/registry.py b/src/nemotron/cli/commands/steps/backends/registry.py
similarity index 96%
rename from src/nemotron/cli/commands/step/backends/registry.py
rename to src/nemotron/cli/commands/steps/backends/registry.py
index 1f754982e..0ecf57673 100644
--- a/src/nemotron/cli/commands/step/backends/registry.py
+++ b/src/nemotron/cli/commands/steps/backends/registry.py
@@ -25,7 +25,7 @@
 
 import typer
 
-from nemotron.cli.commands.step.backends.base import Backend
+from nemotron.cli.commands.steps.backends.base import Backend
 
 _BackendFactory = type[Backend] | Callable[[], Backend] | str
 _REGISTRY: dict[str, _BackendFactory] = {}
diff --git a/src/nemotron/cli/commands/step/backends/slurm.py b/src/nemotron/cli/commands/steps/backends/slurm.py
similarity index 78%
rename from src/nemotron/cli/commands/step/backends/slurm.py
rename to src/nemotron/cli/commands/steps/backends/slurm.py
index 39126a77b..91a4265d7 100644
--- a/src/nemotron/cli/commands/step/backends/slurm.py
+++ b/src/nemotron/cli/commands/steps/backends/slurm.py
@@ -32,7 +32,10 @@
     CodePackager,
     SelfContainedPackager,
 )
-from nemotron.cli.commands.step.backends.base import JobContext
+from nemotron.cli.commands.steps.backends.base import JobContext
+
+_CURATOR_RUNTIME_MODULE = "nemotron.steps._bootstrap.curator_runtime"
+_REMOTE_SRC_DIR = "/nemo_run/code/src"
 
 
 class SlurmBackend:
@@ -134,8 +137,12 @@ def _build_cmd(ctx: JobContext) -> str:
         (so WORLD_SIZE matches the slurm allocation), bare ``python``
         otherwise.
         """
-        if ctx.spec.run.cmd is not None:
-            return ctx.spec.run.cmd.format(script=REMOTE_SCRIPT, config=REMOTE_CONFIG)
+        command_template = SlurmBackend._command_template(ctx)
+        if command_template is not None:
+            command = command_template.format(script=REMOTE_SCRIPT, config=REMOTE_CONFIG)
+            if SlurmBackend._command_uses_curator_runtime(command_template):
+                return SlurmBackend._with_remote_src_pythonpath(command)
+            return command
         if ctx.spec.run.launch == "torchrun":
             # nemo-run's torchrun launcher is set on the executor and handles
             # the actual srun-side wrap; on this code path we just feed the
@@ -143,10 +150,38 @@ def _build_cmd(ctx: JobContext) -> str:
             return f"python {REMOTE_SCRIPT} --config {REMOTE_CONFIG}"
         return f"python {REMOTE_SCRIPT} --config {REMOTE_CONFIG}"
 
+    @staticmethod
+    def _env_get(env: object, key: str, default: object = None) -> object:
+        if env is None:
+            return default
+        if hasattr(env, "get"):
+            return env.get(key, default)
+        return getattr(env, key, default)
+
     @staticmethod
     def _uses_code_packager(ctx: JobContext) -> bool:
-        """Prep steps start Ray internally, so workers need importable modules."""
-        return ctx.step_id.startswith("prep/")
+        """Data prep steps (Ray started internally) and curator-runtime commands need importable modules on workers."""
+        return ctx.step_id.startswith("data_prep/") or SlurmBackend._uses_curator_runtime(ctx)
+
+    @staticmethod
+    def _uses_curator_runtime(ctx: JobContext) -> bool:
+        command_template = SlurmBackend._command_template(ctx)
+        return SlurmBackend._command_uses_curator_runtime(command_template)
+
+    @staticmethod
+    def _command_template(ctx: JobContext) -> str | None:
+        if ctx.spec.run.cmd is not None:
+            return ctx.spec.run.cmd
+        run_command = SlurmBackend._env_get(ctx.env, "run_command")
+        return run_command if isinstance(run_command, str) and run_command else None
+
+    @staticmethod
+    def _command_uses_curator_runtime(command: object) -> bool:
+        return isinstance(command, str) and _CURATOR_RUNTIME_MODULE in command
+
+    @staticmethod
+    def _with_remote_src_pythonpath(command: str) -> str:
+        return f"export PYTHONPATH={_REMOTE_SRC_DIR}${{PYTHONPATH:+:$PYTHONPATH}}; {command}"
 
     @staticmethod
     def _wait_for_ray_job(ray_job: object, *, poll_seconds: int = 30) -> str:
diff --git a/src/nemotron/cli/commands/step/list_cmd.py b/src/nemotron/cli/commands/steps/list_cmd.py
similarity index 62%
rename from src/nemotron/cli/commands/step/list_cmd.py
rename to src/nemotron/cli/commands/steps/list_cmd.py
index 053ba4f95..67ecd4d8b 100644
--- a/src/nemotron/cli/commands/step/list_cmd.py
+++ b/src/nemotron/cli/commands/steps/list_cmd.py
@@ -12,29 +12,40 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""`nemotron step list` — discovery for humans and agents."""
+"""`nemotron steps list` — discovery for humans and agents."""
 
 from __future__ import annotations
 
 import json as json_module
+from collections import defaultdict
 from typing import Annotated
 
 import typer
 from rich.console import Console
 from rich.table import Table
+from rich.tree import Tree
 
-from nemotron.steps.index import StepInfo, discover_steps
+from nemotron.steps.index import CATEGORY_TITLES, StepInfo, discover_steps
 
 console = Console()
 
 
-def _matches(step: StepInfo, *, category: str | None, consumes: str | None, produces: str | None) -> bool:
+def _matches(
+    step: StepInfo,
+    *,
+    category: str | None,
+    consumes: str | None,
+    produces: str | None,
+    tag: str | None,
+) -> bool:
     if category and step.category != category:
         return False
     if consumes and not any(a.type == consumes for a in step.consumes):
         return False
     if produces and not any(a.type == produces for a in step.produces):
         return False
+    if tag and tag not in step.tags:
+        return False
     return True
 
 
@@ -55,6 +66,38 @@ def _step_to_dict(step: StepInfo) -> dict:
     }
 
 
+def _render_table(steps: list[StepInfo]) -> None:
+    table = Table(title="Available Steps", show_lines=False)
+    table.add_column("ID", style="cyan", no_wrap=True)
+    table.add_column("Category")
+    table.add_column("Consumes")
+    table.add_column("Produces")
+    table.add_column("Description", overflow="fold")
+
+    for step in steps:
+        consumes_str = ", ".join(a.type for a in step.consumes) or "-"
+        produces_str = ", ".join(a.type for a in step.produces) or "-"
+        table.add_row(step.id, step.category, consumes_str, produces_str, step.description.split("\n")[0])
+
+    console.print(table)
+
+
+def _render_tree(steps: list[StepInfo]) -> None:
+    grouped: dict[str, list[StepInfo]] = defaultdict(list)
+    for step in steps:
+        grouped[step.category].append(step)
+
+    root = Tree("[bold]Available Steps[/bold]")
+    for category in sorted(grouped):
+        title = CATEGORY_TITLES.get(category, category)
+        branch = root.add(f"[cyan]{category}[/cyan] — {title}")
+        for step in grouped[category]:
+            summary = step.description.split("\n")[0] or "(no description)"
+            branch.add(f"[bold]{step.id}[/bold]  [dim]{summary}[/dim]")
+
+    console.print(root)
+
+
 def list_steps(
     category: Annotated[
         str | None,
@@ -68,9 +111,25 @@ def list_steps(
         str | None,
         typer.Option("--produces", help="Only steps that produce this artifact type."),
     ] = None,
-    as_json: Annotated[bool, typer.Option("--json", help="Emit JSON array (agent-friendly).")] = False,
+    tag: Annotated[
+        str | None,
+        typer.Option("--tag", help="Only steps whose manifest tags include this value."),
+    ] = None,
+    tree: Annotated[
+        bool,
+        typer.Option("--tree", help="Group steps by category in a tree view."),
+    ] = False,
+    as_json: Annotated[
+        bool, typer.Option("--json", help="Emit JSON array (agent-friendly).")
+    ] = False,
 ) -> None:
-    steps = [s for s in discover_steps() if _matches(s, category=category, consumes=consumes, produces=produces)]
+    """List discovered steps. Use ``--json`` for machine-readable output or ``--tree`` for a grouped view."""
+
+    steps = [
+        s
+        for s in discover_steps()
+        if _matches(s, category=category, consumes=consumes, produces=produces, tag=tag)
+    ]
 
     if as_json:
         typer.echo(json_module.dumps([_step_to_dict(s) for s in steps], indent=2))
@@ -80,16 +139,8 @@ def list_steps(
         console.print("[yellow]No steps matched.[/yellow]")
         return
 
-    table = Table(title="Available Steps", show_lines=False)
-    table.add_column("ID", style="cyan", no_wrap=True)
-    table.add_column("Category")
-    table.add_column("Consumes")
-    table.add_column("Produces")
-    table.add_column("Description", overflow="fold")
-
-    for step in steps:
-        consumes_str = ", ".join(a.type for a in step.consumes) or "-"
-        produces_str = ", ".join(a.type for a in step.produces) or "-"
-        table.add_row(step.id, step.category, consumes_str, produces_str, step.description.split("\n")[0])
+    if tree:
+        _render_tree(steps)
+        return
 
-    console.print(table)
+    _render_table(steps)
diff --git a/src/nemotron/cli/commands/step/run_cmd.py b/src/nemotron/cli/commands/steps/run_cmd.py
similarity index 67%
rename from src/nemotron/cli/commands/step/run_cmd.py
rename to src/nemotron/cli/commands/steps/run_cmd.py
index 75a0e9b17..b4bfa2e75 100644
--- a/src/nemotron/cli/commands/step/run_cmd.py
+++ b/src/nemotron/cli/commands/steps/run_cmd.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""`nemotron step run` — generic step execution.
+"""`nemotron steps run` — generic step execution.
 
 Thin dispatcher. The job of this command is to:
   1. Resolve a step id → step.py + runspec.
@@ -21,12 +21,16 @@
   4. Hand off to ``backend.submit(ctx)``.
 
 All execution-mechanics live in the per-backend modules under
-``nemotron.cli.commands.step.backends.*``. To add a new submission target,
+``nemotron.cli.commands.steps.backends.*``. To add a new submission target,
 write one Backend subclass and ``register()`` it — no edits here.
+
+Overrides are passed as bare ``key=value`` positionals at the end of the
+command, e.g. ``nemotron steps run peft/automodel -c default train.train_iters=5000``.
 """
 
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Annotated
 
 import typer
@@ -43,8 +47,10 @@
 from nemo_runspec.display import display_job_config, display_job_submission
 from nemo_runspec.env import parse_env
 from nemo_runspec.execution import build_env_vars, get_startup_commands
-from nemotron.cli.commands.step._resolve import resolve_step
-from nemotron.cli.commands.step.backends import JobContext, get_backend
+from nemotron.cli.commands.steps._resolve import resolve_step
+from nemotron.cli.commands.steps.backends import JobContext, get_backend
+
+_CURATOR_RUNTIME_MODULE = "nemotron.steps._bootstrap.curator_runtime"  # marker searched for in a profile's run_command
 
 
 def run_step(
@@ -111,6 +117,11 @@ def run_step(
 
     env_vars = build_env_vars(job_config, env_for_executor)
     startup_commands = list(get_startup_commands(env_for_executor) or [])
+    curator_runtime_env = _build_curator_runtime_env_vars(
+        script_path=script_path,
+        env=env_for_executor,
+        mode=global_ctx.mode,
+    )
 
     display_job_submission(
         job_path,
@@ -119,6 +130,7 @@ def run_step(
         global_ctx.mode,
         artifacts=job_config.get("artifacts"),
     )
+    env_vars.update(curator_runtime_env)
 
     executor_type = _executor_type(env_for_executor, default="local" if global_ctx.mode == "local" else None)
     if executor_type is None:
@@ -153,3 +165,52 @@ def _executor_type(env: object, *, default: str | None) -> str | None:
     if hasattr(env, "get"):
         return env.get("executor", default)
     return getattr(env, "executor", default)
+
+
+def _build_curator_runtime_env_vars(*, script_path: Path, env: object, mode: str) -> dict[str, str]:
+    """Return env vars encoding Curator runtime payloads; ``{}`` unless a run/batch Curator profile."""
+    is_remote = mode in {"run", "batch"}
+    if not (is_remote and _uses_curator_runtime(env)):
+        return {}
+
+    from nemotron.steps._bootstrap import runtime_payloads  # deferred until a Curator profile needs it
+
+    checkout_root = _find_source_checkout_root(script_path)
+    if checkout_root is None:
+        # No source checkout above the script: read payloads instead of building them.
+        payloads, origin = runtime_payloads.read_runtime_payloads(), str(runtime_payloads.DEFAULT_OUTPUT_DIR)
+    else:
+        payloads, origin = runtime_payloads.build_runtime_payloads(checkout_root), str(checkout_root)
+
+    if payloads:
+        typer.echo(f"Prepared Curator runtime requirements from {origin}")
+        return runtime_payloads.encode_runtime_payload_env(payloads)
+
+    typer.echo(
+        "Curator runtime metadata is required for this remote profile, but none was found. "
+        "Run this command from a source checkout containing pyproject.toml and src/nemotron, "
+        "or install a wheel that includes nemotron.steps._bootstrap.runtime package data.",
+        err=True,
+    )
+    raise typer.Exit(1)  # required metadata missing: abort the command with status 1
+
+
+def _uses_curator_runtime(env: object) -> bool:
+    """Report whether the profile's ``run_command`` is a string that mentions the Curator runtime module."""
+    return isinstance(command := _env_get(env, "run_command"), str) and _CURATOR_RUNTIME_MODULE in command
+
+
+def _env_get(env: object, key: str, default: object = None) -> object:
+    """Look up ``key`` on ``env`` via ``.get`` when it has one, else by attribute; ``None`` env gives ``default``."""
+    if env is not None and hasattr(env, "get"):
+        return env.get(key, default)
+    # Attribute-style profile, or no profile at all.
+    return default if env is None else getattr(env, key, default)
+
+
+def _find_source_checkout_root(path: Path) -> Path | None:
+    """Return the nearest ancestor of ``path`` holding ``pyproject.toml`` and ``src/nemotron``, else ``None``."""
+    for candidate in path.resolve().parents:  # nearest first; parents[0] is already the parent directory
+        if (candidate / "pyproject.toml").is_file() and (candidate / "src" / "nemotron").is_dir():
+            return candidate
+    return None
diff --git a/src/nemotron/cli/commands/step/show_cmd.py b/src/nemotron/cli/commands/steps/show_cmd.py
similarity index 89%
rename from src/nemotron/cli/commands/step/show_cmd.py
rename to src/nemotron/cli/commands/steps/show_cmd.py
index 98f049e5a..c349bb1ac 100644
--- a/src/nemotron/cli/commands/step/show_cmd.py
+++ b/src/nemotron/cli/commands/steps/show_cmd.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""`nemotron step show` — full manifest + runspec for one step."""
+"""`nemotron steps show` — full manifest + runspec for one step."""
 from __future__ import annotations
 
 import json as json_module
@@ -23,8 +23,8 @@
 from rich.console import Console
 
 from nemo_runspec import parse as parse_runspec
-from nemotron.cli.commands.step._resolve import resolve_step
-from nemotron.cli.commands.step.list_cmd import _step_to_dict
+from nemotron.cli.commands.steps._resolve import resolve_step
+from nemotron.cli.commands.steps.list_cmd import _step_to_dict
 
 console = Console()
 
@@ -60,7 +60,10 @@ def show_step(
         console.print("\n[bold]Parameters[/bold]")
         for p in step.parameters:
             default = "" if p.default is None else f" (default={p.default})"
-            console.print(f"  • [yellow]{p.name}[/yellow]{default} — {p.description}")
+            choices = (
+                f" (choices: {', '.join(str(c) for c in p.choices)})" if p.choices else ""
+            )
+            console.print(f"  • [yellow]{p.name}[/yellow]{default}{choices} — {p.description}")
     if spec is not None:
         console.print("\n[bold]Runspec[/bold]")
         console.print(f"  launcher: [magenta]{spec.run.launch}[/magenta]")
diff --git a/src/nemotron/cli/commands/steps/translation.py b/src/nemotron/cli/commands/steps/translation.py
deleted file mode 100644
index a75c1206c..000000000
--- a/src/nemotron/cli/commands/steps/translation.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""CLI command for the Curator-backed translation step."""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any
-
-from omegaconf import DictConfig, OmegaConf
-import typer
-
-from nemo_runspec import parse as parse_runspec
-from nemo_runspec.config import parse_config
-from nemo_runspec.recipe_config import RecipeConfig, parse_recipe_config
-from nemo_runspec.recipe_typer import RecipeMeta
-
-SCRIPT_PATH = "src/nemotron/steps/translate/translation/step.py"
-SPEC = parse_runspec(SCRIPT_PATH)
-
-META = RecipeMeta(
-    name=SPEC.name,
-    script_path=SCRIPT_PATH,
-    config_dir=str(SPEC.config_dir),
-    default_config=SPEC.config.default,
-    input_artifacts={"data": "JSONL or Parquet corpus to translate"},
-    output_artifacts={"translated": "Translated JSONL or Parquet output shards"},
-)
-
-
-def _as_plain_dict(config: DictConfig) -> dict[str, Any]:
-    data = OmegaConf.to_container(config, resolve=True)
-    if not isinstance(data, dict):
-        raise TypeError("Translation config must be a mapping")
-    return data
-
-
-def _load_translation_config(cfg: RecipeConfig) -> dict[str, Any]:
-    """Load the translation step YAML and apply CLI dotlist overrides."""
-    return _as_plain_dict(parse_config(cfg.ctx, SPEC.config_dir, SPEC.config.default))
-
-
-def _run_translation_step(config: dict[str, Any]) -> Path:
-    """Call the checked-in translation step runtime."""
-    from nemotron.steps.translate.translation.step import run
-
-    return run(config)
-
-
-def translation(ctx: typer.Context) -> None:
-    """Run corpus translation with NeMo Curator.
-
-    Example:
-        nemotron steps translation \\
-          input_path=/data/source.jsonl \\
-          output_dir=/data/translated \\
-          source_language=en \\
-          target_language=hi
-    """
-    cfg = parse_recipe_config(ctx)
-
-    if cfg.mode != "local":
-        typer.echo(
-            "Error: nemotron steps translation currently supports local execution only.",
-            err=True,
-        )
-        raise typer.Exit(1)
-
-    if cfg.passthrough:
-        typer.echo(
-            "Error: nemotron steps translation accepts key=value config overrides only.",
-            err=True,
-        )
-        raise typer.Exit(1)
-
-    try:
-        config = _load_translation_config(cfg)
-    except FileNotFoundError as exc:
-        typer.echo(f"Error: {exc}", err=True)
-        raise typer.Exit(1) from exc
-
-    if cfg.dry_run:
-        typer.echo(OmegaConf.to_yaml(OmegaConf.create(config), resolve=True))
-        return
-
-    output_path = _run_translation_step(config)
-    typer.echo(f"Translation complete. Output: {output_path}")
diff --git a/src/nemotron/steps/PATTERNS.md b/src/nemotron/steps/PATTERNS.md
index bf3e93a80..6436f5d28 100644
--- a/src/nemotron/steps/PATTERNS.md
+++ b/src/nemotron/steps/PATTERNS.md
@@ -12,17 +12,17 @@
 | [eval-before-and-after-training](patterns/eval-before-and-after-training.md) | Evaluate before and after training | eval, pipeline-structure | You are about to train or adapt a model and need to prove improvement.<br>A pipeline includes SFT, RL, conversion, or any quality-changing stage.<br>You need to compare multiple training runs fairly. | high |
 | [eval-bookends](patterns/eval-bookends.md) | Evaluate before and after training | eval, pipeline-structure | You are about to train or adapt a model and need to prove improvement.<br>A pipeline includes SFT, RL, conversion, or any quality-changing stage.<br>You need to compare multiple training runs fairly. | high |
 | [multilingual-tokenizer-check](patterns/multilingual-tokenizer-check.md) | Check tokenizer coverage for multilingual training | tokenizer, multilingual, validation | Training data includes non-English text or mixed-language prompts.<br>You are adapting a mostly English base model to another language.<br>The target language uses scripts, spacing rules, or morphology unlike English. | high |
-| [pack-variable-length](patterns/pack-variable-length.md) | Pack variable-length SFT data | prep, sft, efficiency | Training examples range from very short to very long sequences.<br>GPU utilization is poor because padding dominates batches.<br>You are preparing data for Megatron-Bridge SFT with packed inputs available. | high |
+| [pack-variable-length](patterns/pack-variable-length.md) | Pack variable-length SFT data | data_prep, sft, efficiency | Training examples range from very short to very long sequences.<br>GPU utilization is poor because padding dominates batches.<br>You are preparing data for Megatron-Bridge SFT with packed inputs available. | high |
 | [peft-adapter-merge-discipline](patterns/peft-adapter-merge-discipline.md) | Keep adapter artifacts separate until merge is validated | peft, lora, convert, deployment | A LoRA or PEFT run needs to produce a standalone deployable checkpoint.<br>An adapter checkpoint is about to be merged into a base model.<br>Downstream evaluation or serving expects a full HuggingFace checkpoint rather than an adapter. | high |
 | [prefer-llm-for-structured-chat](patterns/prefer-llm-for-structured-chat.md) | Prefer LLM translation for structured chat data | translate, chat, structured-data | The input is OpenAI-style chat data, tool-calling transcripts, or nested message records.<br>The translated output must preserve JSON, code blocks, markup, or message structure.<br>The user wants to translate messages.*.content or another wildcard field path. | high |
 | [prefer-nmt-for-large-corpora](patterns/prefer-nmt-for-large-corpora.md) | Prefer NMT for large plain-text corpora | translate, nmt, throughput | The corpus is large, mostly plain text, and a local NMT service is available.<br>Translation throughput or cost matters more than nuanced instruction following.<br>The user mentions an IndicTrans, NMT, or local translation server. | high |
-| [prep-data-is-tokenizer-locked](patterns/prep-data-is-tokenizer-locked.md) | Treat prepared data as tokenizer-locked | prep, tokenizer, data-artifacts | You are reusing packed Parquet or bin/idx data after changing the tokenizer, chat template, or sequence length.<br>A downstream trainer reports shape, vocabulary, EOS, loss-mask, or data-prefix mismatches.<br>You need to decide whether an existing prepared dataset is still compatible with a new training config. | high |
+| [prep-data-is-tokenizer-locked](patterns/prep-data-is-tokenizer-locked.md) | Treat prepared data as tokenizer-locked | data_prep, tokenizer, data-artifacts | You are reusing packed Parquet or bin/idx data after changing the tokenizer, chat template, or sequence length.<br>A downstream trainer reports shape, vocabulary, EOS, loss-mask, or data-prefix mismatches.<br>You need to decide whether an existing prepared dataset is still compatible with a new training config. | high |
 | [pretrain-token-budget-before-scale](patterns/pretrain-token-budget-before-scale.md) | Define the pretraining token budget before scaling | pretrain, planning, scaling, budget | You are planning pretraining or continued pretraining beyond a smoke test.<br>A pretrain config is being scaled from tiny/local execution to multi-GPU or multi-node execution.<br>You need to choose between pretrain/automodel and pretrain/megatron_bridge.<br>Cluster cost or wall-clock budget is being requested for a pretrain run. | high |
 | [production-export-trt](patterns/production-export-trt.md) | Consider TensorRT-LLM export for production serving | convert, deploy, production | The end goal is low-latency or high-throughput production inference.<br>A trained model must move from experimentation into a serving stack.<br>You need better serving efficiency than a generic research checkpoint provides. | medium |
 | [rl-validate-rewards-before-scale](patterns/rl-validate-rewards-before-scale.md) | Validate RL rewards before scaling rollouts | rl, rewards, validation | An RLVR reward function, NeMo-Gym resource server, or learned reward model is being added.<br>Reward is improving but held-out examples or human review look worse.<br>A DPO, RLVR, or RLHF run is moving from tiny validation to production rollout counts. | high |
 | [sdg-pipeline-versioning](patterns/sdg-pipeline-versioning.md) | Version synthetic data generation as a pipeline | sdg, data-quality, reproducibility | Synthetic SFT, tool-use, prompt, or preference data is being generated.<br>A Data Designer config is moving from preview mode to a production-scale generation job.<br>Generated data will feed SFT, DPO, RLVR, RLHF, or downstream data prep.<br>A second SDG run needs to reproduce or extend an earlier corpus. | high |
 | [sft-data-blending](patterns/sft-data-blending.md) | Blend SFT data deliberately across capabilities | sft, data-blend, sovereign, capabilities | An SFT corpus combines instruction-following, chat, tool use, reasoning, and domain-specific data.<br>You are mixing translated/synthetic data with curated human-written data.<br>Sovereign / regional SFT data is being blended with broader open-source instruction sets.<br>After SFT the model loses one capability while gaining another. | high |
-| [sft-sequence-packing](patterns/sft-sequence-packing.md) | Pack variable-length SFT data | prep, sft, efficiency | Training examples range from very short to very long sequences.<br>GPU utilization is poor because padding dominates batches.<br>You are preparing data for Megatron-Bridge SFT with packed inputs available. | high |
+| [sft-sequence-packing](patterns/sft-sequence-packing.md) | Pack variable-length SFT data | data_prep, sft, efficiency | Training examples range from very short to very long sequences.<br>GPU utilization is poor because padding dominates batches.<br>You are preparing data for Megatron-Bridge SFT with packed inputs available. | high |
 | [sft-small-dataset-prefer-lora](patterns/sft-small-dataset-prefer-lora.md) | Prefer LoRA for small SFT datasets | sft, data-size, efficiency | The supervised fine-tuning dataset has fewer than 10,000 examples.<br>You want to adapt model behavior without rewriting broad world knowledge.<br>GPU budget is limited and full checkpoint churn would slow iteration. | high |
 | [small-dataset-lora](patterns/small-dataset-lora.md) | Prefer LoRA for small SFT datasets | sft, data-size, efficiency | The supervised fine-tuning dataset has fewer than 10,000 examples.<br>You want to adapt model behavior without rewriting broad world knowledge.<br>GPU budget is limited and full checkpoint churn would slow iteration. | high |
 | [translate-training-corpus](patterns/translate-training-corpus.md) | Translate corpora before multilingual training | translate, multilingual, data-prep | The user wants to translate a corpus, dataset, or chat records before CPT or SFT.<br>Training data must be produced in a target language from source-language examples.<br>A multilingual fine-tuning pipeline needs translated JSONL or Parquet artifacts. | high |
diff --git a/src/nemotron/steps/SKILL.md b/src/nemotron/steps/SKILL.md
index 780edab94..266042684 100644
--- a/src/nemotron/steps/SKILL.md
+++ b/src/nemotron/steps/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: nemotron-steps
-description: Navigate the Nemotron step library across prep, pretrain, SFT, PEFT, RL, synthetic data generation, and optimization. Use when planning end-to-end pipelines, choosing a backend, checking artifact compatibility, or finding the correct step SKILL.md, step.toml, runner, config, and upstream reference repo.
+description: Navigate the Nemotron step library across curation, data_prep, translation, pretrain, SFT, PEFT, RL, synthetic data generation, BYOB benchmarks, conversion, optimization, evaluation, and env setup. Use when planning end-to-end pipelines, choosing a backend, checking artifact compatibility, or finding the correct step SKILL.md, step.toml, runner, config, and upstream reference repo.
 ---
 
 # Nemotron Steps
@@ -11,18 +11,23 @@ Use this skill as the entry point for the Nemotron training and optimization ste
 
 | Need | Start With | Primary Artifacts |
 | --- | --- | --- |
-| SFT packing, pretrain bin/idx, RL sharding | `prep/SKILL.md` | `training_jsonl`, `packed_parquet`, `binidx` |
+| Lightweight text curation | `curate/nemo_curator/SKILL.md` | `raw_jsonl`, `filtered_jsonl` |
+| SFT packing, pretrain bin/idx, RL sharding | `data_prep/SKILL.md` | `training_jsonl`, `packed_parquet`, `binidx` |
+| Corpus translation and FAITH scoring | `translate/SKILL.md` | `filtered_jsonl`, `translated_jsonl` |
 | Pretraining or continued pretraining | `pretrain/SKILL.md` | `binidx`, `checkpoint_hf`, `checkpoint_megatron` |
 | Supervised fine-tuning | `sft/SKILL.md` | `training_jsonl`, `packed_parquet`, checkpoints |
 | LoRA or adapter tuning | `peft/SKILL.md` | `checkpoint_lora` |
 | DPO, RLVR, or RLHF alignment | `rl/SKILL.md` | prompt or preference JSONL, Megatron checkpoints |
 | SFT SDG or RL preference SDG | `sdg/SKILL.md` | `synthetic_jsonl` |
+| BYOB benchmark generation or translation | `byob/SKILL.md` | benchmark parquet artifacts |
+| Checkpoint format conversion or LoRA merge | `convert/SKILL.md` | `checkpoint_hf`, `checkpoint_megatron`, `checkpoint_lora` |
 | Quantization, distillation, pruning | `optimize/SKILL.md` | optimized HF or Megatron checkpoints |
+| Evaluation | `eval/model_eval/SKILL.md` | `eval_results` |
 | Execution profiles and Lepton/Ray env setup | `env/SKILL.md` | `env_toml` |
 
 ## Workflow
 
-1. For any Lepton, Slurm, Ray, or other non-local run, create or verify the env profile file first with `env/SKILL.md`. The default lookup is repository-root `env.toml`; generated backend examples use `env.lepton.toml` or `env.slurm.toml` and must be selected with `NEMOTRON_ENV_FILE`.
+1. For any Lepton, Slurm, DGX Cloud, Ray, or other non-local run, create or verify the env profile file first with `env/SKILL.md`. The default lookup is repository-root `env.toml`; generated backend examples use `env.lepton.toml`, `env.slurm.toml`, or `env.dgxcloud.toml` and must be selected with `NEMOTRON_ENV_FILE`.
 2. Read the most specific `SKILL.md` for the requested stage.
 3. Read that step's `step.toml` first to understand the flow: intent, consumed and produced artifacts, important parameters, strategies, failure modes, and upstream references. Treat it as the agent-facing contract before editing configs or step code.
 4. Start from `config/tiny.yaml` for runner validation and `config/default.yaml` for production shape.
diff --git a/src/nemotron/steps/STEPS.md b/src/nemotron/steps/STEPS.md
index 168d3b4fd..6c8ec938e 100644
--- a/src/nemotron/steps/STEPS.md
+++ b/src/nemotron/steps/STEPS.md
@@ -4,7 +4,7 @@
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [byob](byob/) | Generate and translate BYOB MCQ benchmark parquet artifacts from domain documents with an extensible benchmark-family runtime. | benchmark_source_corpus, benchmark_parquet (optional) | mcq_benchmark_parquet, translated_mcq_benchmark_parquet (optional) |
+| [byob/mcq](byob/mcq/) | Generate and translate BYOB MCQ benchmark parquet artifacts from domain documents with an extensible benchmark-family runtime. | benchmark_source_corpus, benchmark_parquet (optional) | mcq_benchmark_parquet, translated_mcq_benchmark_parquet (optional) |
 
 ## convert — Conversion
 
@@ -12,25 +12,33 @@
 | --- | --- | --- | --- |
 | [convert/hf_to_megatron](convert/hf_to_megatron/) | Convert a HuggingFace safetensors checkpoint to Megatron distributed format. | checkpoint_hf | checkpoint_megatron |
 | [convert/megatron_to_hf](convert/megatron_to_hf/) | Convert a Megatron distributed checkpoint to HuggingFace safetensors format. | checkpoint_megatron | checkpoint_hf |
-| [convert/merge_lora](convert/merge_lora/) | Merge a LoRA adapter into the base model to produce a standalone HuggingFace checkpoint. | checkpoint_lora, checkpoint_hf | checkpoint_hf |
+| [convert/merge_lora](convert/merge_lora/) | Merge a LoRA adapter into its original base model, producing a standalone HuggingFace checkpoint. | checkpoint_lora, checkpoint_hf, checkpoint_megatron (optional) | checkpoint_hf, checkpoint_megatron (optional) |
 
 ## curate — Data Curation
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [curate/nemo_curator](curate/nemo_curator/) | Acquire public or custom text corpora with NeMo Curator, then annotate and filter them by language, domain, and quality to produce downstream-ready JSONL. | - | filtered_jsonl |
+| [curate/nemo_curator](curate/nemo_curator/) | Read JSONL text with NeMo Curator, optionally hydrate a Hugging Face snapshot, apply light language, word-count, and domain filters, and write downstream-ready JSONL. | raw_jsonl | filtered_jsonl |
 
-## env
+## data_prep — Data Preparation
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [env/env_toml](env/env_toml/) | Generate and validate step-linked env profile examples from compact YAML templates for Lepton or Slurm, including inheritance, image overrides, mounts, env-var placeholders, and Ray/RL guardrails. | - | env_toml |
+| [data_prep/pretrain_prep](data_prep/pretrain_prep/) | Tokenise raw text (HF datasets or local parquet/jsonl) into Megatron bin/idx shards and emit a blend.json that pretrain/megatron_bridge and pretrain/automodel can ingest directly. | filtered_jsonl | binidx |
+| [data_prep/rl_prep](data_prep/rl_prep/) | Resolve HuggingFace dataset references in an RL data blend and shard the output JSONL into the prompt / preference layout expected by rl/nemo_rl/*. | training_jsonl | training_jsonl |
+| [data_prep/sft_packing](data_prep/sft_packing/) | Apply the chat template, tokenize training JSONL, and pack examples into Megatron-Bridge-compatible Parquet shards for SFT. | training_jsonl | packed_parquet |
+
+## env — Environment Profiles
+
+| Step | Description | Consumes | Produces |
+| --- | --- | --- | --- |
+| [env/env_toml](env/env_toml/) | Generate and validate step-linked env profile examples from compact YAML templates for Lepton, Slurm, or DGX Cloud, including inheritance, image overrides, mounts, env-var placeholders, Curator/Data Designer profiles, and Ray/RL guardrails. | - | env_toml |
 
 ## eval — Evaluation
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [eval/model_eval](eval/model_eval/) | Deploy a trained checkpoint behind an OpenAI-compatible endpoint and run benchmark suites with NeMo Evaluator, producing consolidated evaluation results. | checkpoint_megatron (optional), checkpoint_hf (optional) | eval_results |
+| [eval/model_eval](eval/model_eval/) | Deploy a Megatron Bridge checkpoint behind an OpenAI-compatible endpoint, or evaluate an existing hosted endpoint, with NeMo Evaluator Launcher. | checkpoint_megatron (optional) | eval_results |
 
 ## optimize — Model Optimization
 
@@ -45,15 +53,7 @@
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
 | [peft/automodel](peft/automodel/) | Parameter-efficient fine-tuning (LoRA) with the AutoModel stack. Same training loop as sft/automodel but with a LoRA adapter wired in by default, making larger HF backbones practical for adapter-based tuning. | training_jsonl | checkpoint_lora |
-| [peft/megatron_bridge](peft/megatron_bridge/) | Parameter-efficient fine-tuning (LoRA) on top of Megatron-Bridge. Useful when a full SFT exceeds memory but you still want TP/PP/CP scaling. Consumes packed Parquet from prep/sft_packing. | packed_parquet, checkpoint_megatron | checkpoint_lora |
-
-## prep — Data Preparation
-
-| Step | Description | Consumes | Produces |
-| --- | --- | --- | --- |
-| [prep/pretrain_prep](prep/pretrain_prep/) | Tokenise raw text (HF datasets or local parquet/jsonl) into Megatron bin/idx shards and emit a blend.json that pretrain/megatron_bridge and pretrain/automodel can ingest directly. | filtered_jsonl | binidx |
-| [prep/rl_prep](prep/rl_prep/) | Resolve HuggingFace dataset references in an RL data blend and shard the output JSONL into the prompt / preference layout expected by rl/nemo_rl/*. | training_jsonl | training_jsonl |
-| [prep/sft_packing](prep/sft_packing/) | Apply the chat template, tokenize training JSONL, and pack examples into Megatron-Bridge-compatible Parquet shards for SFT. | training_jsonl | packed_parquet |
+| [peft/megatron_bridge](peft/megatron_bridge/) | Parameter-efficient fine-tuning (LoRA) on top of Megatron-Bridge. Useful when a full SFT exceeds memory but you still want TP/PP/CP scaling. Consumes packed Parquet from data_prep/sft_packing. | packed_parquet, checkpoint_megatron | checkpoint_lora |
 
 ## pretrain — Pretraining
 
@@ -69,30 +69,22 @@
 | [rl/nemo_rl/dpo](rl/nemo_rl/dpo/) | Direct Preference Optimisation alignment with NeMo-RL. Consumes a preference dataset (chosen / rejected pairs) and an SFT-trained checkpoint. | training_jsonl, checkpoint_megatron | checkpoint_megatron |
 | [rl/nemo_rl/rlhf](rl/nemo_rl/rlhf/) | RLHF with a learned judge / generative reward model on top of NeMo-RL's GRPO loop. Uses NeMo-Gym for GenRM-style comparison rewards by default. | training_jsonl, checkpoint_megatron, checkpoint_hf | checkpoint_megatron |
 | [rl/nemo_rl/rlvr](rl/nemo_rl/rlvr/) | RL with Verifiable Rewards via GRPO (NeMo-RL). Designed for tasks with programmatic reward signals such as math problem solving or unit-tested code. Use config/nemo_gym.yaml for NeMo-Gym resource-server rewards. | training_jsonl, checkpoint_megatron | checkpoint_megatron |
-| [rl/nemo_rl_grpo](rl/nemo_rl_grpo/) | Planned: align an SFT-trained Megatron checkpoint with GRPO using NeMo-RL. | training_jsonl, checkpoint_megatron | checkpoint_megatron |
 
 ## sdg — Synthetic Data Generation
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [sdg/data_designer](sdg/data_designer/) | Build a NeMo Data Designer pipeline declaratively and generate synthetic data. Two recipes ship in config/: 'default' produces SFT chat data, 'rl_pref' produces preference pairs (chosen / rejected) for DPO.  Customisation lives in YAML — step.py just translates declarative column specs into the upstream DataDesignerConfigBuilder API. | training_jsonl (optional) | synthetic_jsonl |
+| [sdg/data_designer](sdg/data_designer/) | Build a NeMo Data Designer pipeline declaratively and generate synthetic data. Three recipes ship in config/: 'default' produces SFT chat data, 'customer_support_tools' produces tool-call SFT data, and 'rl_pref' produces preference pairs (chosen / rejected) for DPO.  Customisation lives in YAML — step.py just translates declarative column specs into the upstream DataDesignerConfigBuilder API. | training_jsonl (optional) | synthetic_jsonl |
 
 ## sft — Supervised Fine-Tuning
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
 | [sft/automodel](sft/automodel/) | Supervised fine-tuning with the AutoModel stack for HF-format models and JSONL datasets that already use OpenAI chat-format messages. Supports full SFT and LoRA-style adapter tuning from the same step. | training_jsonl | checkpoint_hf |
-| [sft/megatron_bridge](sft/megatron_bridge/) | Supervised fine-tuning using NVIDIA Megatron-Bridge. Best for large-scale distributed training with tensor/pipeline/context parallelism. Requires packed Parquet data from prep/sft_packing. | packed_parquet, checkpoint_megatron (optional) | checkpoint_megatron |
-
-## synth — Synthetic Data Generation
-
-| Step | Description | Consumes | Produces |
-| --- | --- | --- | --- |
-| [synth/data_designer](synth/data_designer/) | Planned: generate synthetic conversation JSONL with Data Designer for downstream SFT. | training_jsonl (optional) | synthetic_jsonl |
+| [sft/megatron_bridge](sft/megatron_bridge/) | Supervised fine-tuning using NVIDIA Megatron-Bridge. Best for large-scale distributed training with tensor/pipeline/context parallelism. Requires packed Parquet data from data_prep/sft_packing. | packed_parquet, checkpoint_megatron (optional) | checkpoint_megatron |
 
 ## translate — Translation
 
 | Step | Description | Consumes | Produces |
 | --- | --- | --- | --- |
-| [translate/nemo_skills](translate/nemo_skills/) | Translate filtered JSONL into a target language with NeMo Skills and attach FAITH-based quality signals so downstream steps can keep high-faith training data. | filtered_jsonl | translated_jsonl |
-| [translate/translation](translate/translation/) | Translate JSONL or Parquet training corpora with NeMo Curator's TranslationStage, preserving structured fields and optionally attaching FAITH quality scores. | filtered_jsonl | translated_jsonl |
+| [translate/nemo_curator](translate/nemo_curator/) | Translate JSONL or Parquet training corpora with NeMo Curator's TranslationStage, preserving structured fields and optionally attaching FAITH quality scores. | filtered_jsonl | translated_jsonl |
diff --git a/src/nemotron/steps/_bootstrap/__init__.py b/src/nemotron/steps/_bootstrap/__init__.py
new file mode 100644
index 000000000..11087ed59
--- /dev/null
+++ b/src/nemotron/steps/_bootstrap/__init__.py
@@ -0,0 +1 @@
+"""Runtime bootstrap helpers for remote step execution."""
diff --git a/src/nemotron/steps/_bootstrap/curator_runtime.py b/src/nemotron/steps/_bootstrap/curator_runtime.py
new file mode 100644
index 000000000..32d890a2a
--- /dev/null
+++ b/src/nemotron/steps/_bootstrap/curator_runtime.py
@@ -0,0 +1,580 @@
+#!/usr/bin/env python3
+"""Create a pyproject-driven Curator/BYOB runtime, then exec the step.
+
+The dependency source of truth is a compact runtime manifest and requirements
+file shipped with the bootstrap source tree. Local fallback can still read
+``pyproject.toml`` directly. The bootstrap only decides which runtime profile
+to install and where to create the container-local virtual environment.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import shlex
+import shutil
+import subprocess
+import sys
+import tempfile
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:  # pragma: no cover - Python 3.10 fallback for older containers.
+    import tomli as tomllib  # type: ignore[no-redef]
+
+
+# Path that ``_runtime_env`` puts at the front of PYTHONPATH when it exists on disk.
+# Overridable through the NEMOTRON_CURATOR_PATH environment variable.
+DEFAULT_CURATOR_PATH = Path(os.environ.get("NEMOTRON_CURATOR_PATH", "/opt/Curator"))
+# Parent directory under which ``ensure_runtime`` creates one venv per ``venv_name``.
+DEFAULT_VENV_ROOT = Path(os.environ.get("NEMOTRON_CURATOR_VENV_ROOT", "/tmp/nemotron-curator-runtime"))
+# Directory where env-delivered runtime payloads are unpacked (one subdirectory
+# per payload digest prefix); defaults to ``<venv root>/metadata``.
+DEFAULT_METADATA_ROOT = Path(
+    os.environ.get("NEMOTRON_CURATOR_METADATA_ROOT", str(DEFAULT_VENV_ROOT / "metadata"))
+)
+
+
+@dataclass(frozen=True)
+class ProjectMetadata:
+    """Where runtime metadata was found and what it contained.
+
+    Instances built from a ``runtime.json`` manifest carry the parsed JSON in
+    ``manifest`` with ``pyproject``/``lockfile`` set to ``None`` and ``data``
+    empty; instances built from ``pyproject.toml`` carry the parsed TOML in
+    ``data`` and leave ``manifest`` as ``None``.
+    """
+
+    root: Path  # directory in which the metadata file was found
+    pyproject: Path | None  # path to pyproject.toml, when that was the source
+    lockfile: Path | None  # path to uv.lock next to pyproject.toml, if present
+    data: Mapping[str, Any]  # parsed pyproject.toml contents ({} for manifests)
+    manifest: Mapping[str, Any] | None = None  # parsed runtime.json contents
+
+
+@dataclass(frozen=True)
+class RuntimeSpec:
+    """Resolved settings for one runtime profile: what to install and how to verify it."""
+
+    name: str
+    venv_name: str
+    extras: tuple[str, ...]
+    extra_index_urls: tuple[str, ...]
+    torch_backend: str | None
+    omit_packages: tuple[str, ...]
+    required_modules: tuple[str, ...]
+    spec_only_modules: tuple[str, ...]
+    pyproject_digest: str
+    requirements_file: Path | None = None
+    constraints_file: Path | None = None
+    overrides_file: Path | None = None
+
+    @property
+    def stamp(self) -> str:
+        """Return a 16-hex-character fingerprint of this spec and the Python version.
+
+        Only the file *names* of the constraints/overrides files take part in
+        the fingerprint; ``requirements_file`` is not included.
+        """
+        constraints_name = None
+        if self.constraints_file:
+            constraints_name = self.constraints_file.name
+        overrides_name = None
+        if self.overrides_file:
+            overrides_name = self.overrides_file.name
+
+        identity = (
+            self.name,
+            self.venv_name,
+            self.extras,
+            self.extra_index_urls,
+            self.torch_backend,
+            self.omit_packages,
+            self.required_modules,
+            self.spec_only_modules,
+            self.pyproject_digest,
+            constraints_name,
+            overrides_name,
+            sys.version_info[:2],
+        )
+        hasher = hashlib.sha256(repr(identity).encode("utf-8"))
+        return hasher.hexdigest()[:16]
+
+
+def _quote(argv: Sequence[str | Path]) -> str:
+    """Render *argv* as one shell-quoted command line (used for log output)."""
+    quoted_parts = [shlex.quote(str(token)) for token in argv]
+    return " ".join(quoted_parts)
+
+
+def _run(argv: Sequence[str | Path], *, env: dict[str, str] | None = None) -> None:
+    """Log *argv*, then execute it; a non-zero exit raises ``CalledProcessError``."""
+    command_line = _quote(argv)
+    print(f"[curator-runtime] $ {command_line}", flush=True)
+    string_argv = [str(token) for token in argv]
+    subprocess.run(string_argv, check=True, env=env)
+
+
+def _run_capture(argv: Sequence[str | Path], *, cwd: Path, env: dict[str, str] | None = None) -> str:
+    """Log and run *argv* in *cwd*, returning its captured stdout.
+
+    Raises:
+        RuntimeError: If the command exits non-zero; the message carries the
+            command's stderr, or its stdout when stderr is blank.
+    """
+    command_line = _quote(argv)
+    print(f"[curator-runtime] $ {command_line}", flush=True)
+    completed = subprocess.run(
+        [str(token) for token in argv],
+        cwd=cwd,
+        env=env,
+        text=True,
+        capture_output=True,
+    )
+    captured_err = completed.stderr
+    if completed.returncode != 0:
+        message = captured_err.strip() or completed.stdout.strip()
+        raise RuntimeError(f"command failed with exit code {completed.returncode}: {command_line}\n{message}")
+    if captured_err.strip():
+        # Forward warnings from a successful command, adding a newline only if needed.
+        terminator = "" if captured_err.endswith("\n") else "\n"
+        print(captured_err, file=sys.stderr, end=terminator)
+    return completed.stdout
+
+
+def _runtime_env(venv_dir: Path, curator_path: Path = DEFAULT_CURATOR_PATH) -> dict[str, str]:
+    """Return a copy of the process environment that targets *venv_dir*.
+
+    The copy sets ``VIRTUAL_ENV`` and ``UV_PROJECT_ENVIRONMENT`` to the venv,
+    puts the venv's ``bin`` directory first on ``PATH``, and rewrites
+    ``PYTHONPATH`` with empty entries and duplicates removed. When
+    *curator_path* exists it is placed first on ``PYTHONPATH``.
+    """
+    env = os.environ.copy()
+    env["VIRTUAL_ENV"] = str(venv_dir)
+    env["UV_PROJECT_ENVIRONMENT"] = str(venv_dir)
+
+    # An empty PATH entry is interpreted as the current directory by POSIX
+    # command lookup, so only append the inherited PATH when it is non-empty.
+    bin_dir = str(venv_dir / "bin")
+    inherited_path = env.get("PATH", "")
+    env["PATH"] = f"{bin_dir}:{inherited_path}" if inherited_path else bin_dir
+
+    pythonpath = [part for part in env.get("PYTHONPATH", "").split(":") if part]
+    if curator_path.exists():
+        pythonpath.insert(0, str(curator_path))
+
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    env["PYTHONPATH"] = ":".join(dict.fromkeys(pythonpath))
+    return env
+
+
+def _venv_python(venv_dir: Path) -> Path:
+    """Return the interpreter path inside *venv_dir* (``bin/python`` layout)."""
+    return venv_dir.joinpath("bin", "python")
+
+
+def _venv_uv(venv_dir: Path) -> Path:
+    """Return the path where ``uv`` lives inside *venv_dir* (``bin/uv`` layout)."""
+    return venv_dir.joinpath("bin", "uv")
+
+
+def _ensure_venv(venv_dir: Path, *, recreate: bool) -> Path:
+    """Create the venv at *venv_dir* if its interpreter is missing; return the interpreter.
+
+    With ``recreate=True`` an existing *venv_dir* is deleted first. The venv
+    is created with ``--system-site-packages``.
+    """
+    interpreter = _venv_python(venv_dir)
+    if recreate and venv_dir.exists():
+        shutil.rmtree(venv_dir)
+    if interpreter.exists():
+        return interpreter
+
+    venv_dir.parent.mkdir(parents=True, exist_ok=True)
+    create_command = [sys.executable, "-m", "venv", "--system-site-packages", str(venv_dir)]
+    _run(create_command)
+    return interpreter
+
+
+def _ensure_uv(venv_python: Path, venv_dir: Path, env: dict[str, str]) -> Path:
+    """Return a usable ``uv`` executable, installing it into the venv with pip if needed.
+
+    Raises:
+        RuntimeError: If, after installation, ``uv`` is neither in the venv's
+            ``bin`` directory nor on the ``PATH`` of *env*.
+    """
+    venv_uv = _venv_uv(venv_dir)
+    if not venv_uv.exists():
+        _run([venv_python, "-m", "pip", "install", "--quiet", "uv"], env=env)
+        if not venv_uv.exists():
+            # pip may have placed the script elsewhere; fall back to a PATH lookup.
+            on_path = shutil.which("uv", path=env.get("PATH"))
+            if not on_path:
+                raise RuntimeError("uv was installed but no uv executable was found")
+            return Path(on_path)
+    return venv_uv
+
+
+def _candidate_metadata_dirs(project_metadata: Path | None = None) -> list[Path]:
+    """Return directories to search for runtime metadata, most specific first.
+
+    Order: the explicit *project_metadata* argument (if given), the
+    ``NEMOTRON_PROJECT_METADATA_DIR`` environment variable (if set), the
+    current working directory and its ancestors, then the ancestors of this
+    module's file.
+    """
+    candidates: list[Path] = []
+    if project_metadata is not None:
+        candidates.append(project_metadata)
+    env_metadata = os.environ.get("NEMOTRON_PROJECT_METADATA_DIR")
+    if env_metadata:
+        candidates.append(Path(env_metadata))
+
+    # ``Path.parents`` does not include the path itself, so the working
+    # directory must be added explicitly or metadata located directly in it
+    # would never be considered.
+    cwd = Path.cwd()
+    candidates.append(cwd)
+    candidates.extend(cwd.parents)
+    # For a file path the first parent is already the containing directory.
+    candidates.extend(Path(__file__).resolve().parents)
+    return candidates
+
+
+def _find_env_project_metadata() -> ProjectMetadata | None:
+    """Unpack runtime metadata delivered through environment variables, if any.
+
+    Returns:
+        A manifest-backed ``ProjectMetadata`` rooted at
+        ``DEFAULT_METADATA_ROOT / <first 16 digest chars>``, or ``None`` when
+        the digest variable is unset or no payload files were written.
+
+    Raises:
+        RuntimeError: If the digest variable is not 64 hexadecimal characters.
+        FileNotFoundError: If the unpacked payload does not contain ``runtime.json``.
+    """
+    # Function-scope import: the payload helpers are only loaded when this lookup runs.
+    from nemotron.steps._bootstrap.runtime_payloads import (
+        RUNTIME_PAYLOAD_SHA256_ENV,
+        write_runtime_payloads_from_env,
+    )
+
+    digest = os.environ.get(RUNTIME_PAYLOAD_SHA256_ENV)
+    if not digest:
+        return None
+    # The digest becomes part of a directory name below, so reject anything
+    # that is not a plain SHA-256 hex string.
+    if not re.fullmatch(r"[0-9a-fA-F]{64}", digest):
+        raise RuntimeError(f"Invalid Curator runtime payload digest: {digest!r}")
+
+    root = DEFAULT_METADATA_ROOT / digest[:16]
+    if not write_runtime_payloads_from_env(root, os.environ):
+        return None
+    manifest = root / "runtime.json"
+    if not manifest.exists():
+        raise FileNotFoundError("Curator runtime payload is missing runtime.json")
+    data = json.loads(manifest.read_text(encoding="utf-8"))
+    return ProjectMetadata(root=root, pyproject=None, lockfile=None, data={}, manifest=data)
+
+
+def _find_project_metadata(project_metadata: Path | None = None) -> ProjectMetadata:
+    """Locate runtime metadata, preferring an env-delivered payload over files on disk.
+
+    Each candidate directory is probed as-is and through its
+    ``.nemotron_runtime`` and ``runtime`` subdirectories; within one directory
+    a ``runtime.json`` manifest wins over ``pyproject.toml``.
+
+    Raises:
+        FileNotFoundError: If no candidate contains either file.
+    """
+    from_env = _find_env_project_metadata()
+    if from_env is not None:
+        return from_env
+
+    for base in _candidate_metadata_dirs(project_metadata):
+        for root in (base, base / ".nemotron_runtime", base / "runtime"):
+            manifest_file = root / "runtime.json"
+            if manifest_file.exists():
+                parsed_manifest = json.loads(manifest_file.read_text(encoding="utf-8"))
+                return ProjectMetadata(
+                    root=root,
+                    pyproject=None,
+                    lockfile=None,
+                    data={},
+                    manifest=parsed_manifest,
+                )
+
+            pyproject_file = root / "pyproject.toml"
+            if not pyproject_file.exists():
+                continue
+            parsed_toml = tomllib.loads(pyproject_file.read_text(encoding="utf-8"))
+            lock_candidate = root / "uv.lock"
+            return ProjectMetadata(
+                root=root,
+                pyproject=pyproject_file,
+                lockfile=lock_candidate if lock_candidate.exists() else None,
+                data=parsed_toml,
+            )
+
+    raise FileNotFoundError(
+        "Could not find .nemotron_runtime/runtime.json or pyproject.toml. "
+        "Ensure the source packager ships runtime metadata."
+    )
+
+
+def _runtime_config(
+    data: Mapping[str, Any],
+    runtime_name: str,
+    *,
+    _seen: tuple[str, ...] = (),
+) -> Mapping[str, Any]:
+    """Return the ``[tool.nemotron.runtime.<runtime_name>]`` table with parents merged in.
+
+    A profile may name a parent through a string ``extends`` key; the parent's
+    settings are resolved recursively and overridden by the child's own keys.
+    An unknown profile, or a table that is not a mapping, yields ``{}``.
+
+    Args:
+        data: Parsed ``pyproject.toml`` contents.
+        runtime_name: Profile to resolve.
+        _seen: Internal; profiles already visited on the current ``extends`` chain.
+
+    Raises:
+        ValueError: If the ``extends`` chain loops back onto itself.
+    """
+    # Without this guard a profile that (transitively) extends itself would
+    # recurse until the interpreter's recursion limit is hit.
+    if runtime_name in _seen:
+        chain = " -> ".join((*_seen, runtime_name))
+        raise ValueError(f"Runtime profile 'extends' cycle detected: {chain}")
+
+    runtime_table = data.get("tool", {}).get("nemotron", {}).get("runtime", {})
+    if not isinstance(runtime_table, Mapping):
+        return {}
+    config = runtime_table.get(runtime_name, {})
+    if not isinstance(config, Mapping):
+        return {}
+    parent_name = config.get("extends")
+    if parent_name and isinstance(parent_name, str):
+        merged = dict(_runtime_config(data, parent_name, _seen=(*_seen, runtime_name)))
+        merged.update(config)
+        return merged
+    return config
+
+
+def _as_tuple(value: Any, *, default: tuple[str, ...] = ()) -> tuple[str, ...]:
+    """Coerce a config value to a tuple of strings.
+
+    ``None`` yields *default*, a bare string becomes a one-element tuple and
+    any other sequence is converted item by item with ``str``.
+
+    Raises:
+        TypeError: If *value* is neither ``None``, a string, nor a sequence.
+    """
+    if value is None:
+        return default
+    if isinstance(value, str):
+        return (value,)
+    if not isinstance(value, Sequence):
+        raise TypeError(f"Expected string or list of strings, got {type(value).__name__}")
+    return tuple(map(str, value))
+
+
+def _manifest_path(root: Path, config: Mapping[str, Any], key: str) -> Path | None:
+    """Resolve the optional file named by ``config[key]`` relative to *root*.
+
+    Returns ``None`` when the key is absent or ``None``.
+
+    Raises:
+        ValueError: If the value is present but not a non-empty string.
+    """
+    raw = config.get(key)
+    if raw is None:
+        return None
+    if isinstance(raw, str) and raw:
+        return root / raw
+    raise ValueError(f"Runtime profile field {key!r} must be a file name")
+
+
+def load_runtime_spec(
+    runtime_name: str,
+    metadata: ProjectMetadata,
+    *,
+    cli_extras: Sequence[str] = (),
+) -> RuntimeSpec:
+    """Build the ``RuntimeSpec`` for *runtime_name* from manifest or pyproject metadata.
+
+    Args:
+        runtime_name: Profile to load.
+        metadata: Metadata found by ``_find_project_metadata``.
+        cli_extras: Extras that, when non-empty, replace the profile's own
+            ``extras`` list.
+
+    Raises:
+        ValueError: If the profile is undefined or malformed, if a manifest
+            profile has no ``requirements`` file name, or if *metadata* has
+            neither a manifest nor a pyproject path.
+    """
+    resolved_name = runtime_name
+    if metadata.manifest is not None:
+        # Manifest mode: keys use snake_case and requirements come from files
+        # stored next to the manifest.
+        profiles = metadata.manifest.get("profiles", {})
+        if not isinstance(profiles, Mapping) or resolved_name not in profiles:
+            raise ValueError(f"Runtime profile {resolved_name!r} is not defined in runtime manifest")
+        config = profiles[resolved_name]
+        if not isinstance(config, Mapping):
+            raise ValueError(f"Runtime profile {resolved_name!r} must be a mapping")
+        requirements = config.get("requirements")
+        if not isinstance(requirements, str) or not requirements:
+            raise ValueError(f"Runtime profile {resolved_name!r} is missing a requirements file")
+        return RuntimeSpec(
+            name=str(config.get("name") or resolved_name),
+            venv_name=str(config.get("venv_name") or resolved_name),
+            extras=tuple(cli_extras) or _as_tuple(config.get("extras"), default=(resolved_name,)),
+            extra_index_urls=_as_tuple(config.get("extra_index_urls")),
+            torch_backend=str(config["torch_backend"]) if config.get("torch_backend") else None,
+            # No package filtering is applied here in manifest mode.
+            omit_packages=(),
+            required_modules=_as_tuple(config.get("required_modules")),
+            spec_only_modules=_as_tuple(config.get("spec_only_modules")),
+            # Falls back to a fixed marker when the manifest carries no digest.
+            pyproject_digest=str(config.get("digest") or "runtime-manifest"),
+            requirements_file=metadata.root / requirements,
+            constraints_file=_manifest_path(metadata.root, config, "constraints"),
+            overrides_file=_manifest_path(metadata.root, config, "overrides"),
+        )
+
+    # pyproject mode: keys use kebab-case under [tool.nemotron.runtime.<name>].
+    if metadata.pyproject is None:
+        raise ValueError("Runtime metadata is missing both manifest and pyproject data")
+    config = _runtime_config(metadata.data, resolved_name)
+    if not config:
+        raise ValueError(f"Runtime profile {resolved_name!r} is not defined in pyproject.toml")
+    extras = tuple(cli_extras) or _as_tuple(config.get("extras"), default=(resolved_name,))
+    # The digest covers pyproject.toml and, when present, uv.lock, so a change
+    # to either produces a different stamp.
+    digest = hashlib.sha256(metadata.pyproject.read_bytes()).hexdigest()[:16]
+    if metadata.lockfile:
+        digest = hashlib.sha256((digest + metadata.lockfile.read_text(encoding="utf-8")).encode()).hexdigest()[:16]
+    return RuntimeSpec(
+        name=resolved_name,
+        venv_name=str(config.get("venv-name") or resolved_name),
+        extras=extras,
+        extra_index_urls=_as_tuple(config.get("extra-index-urls")),
+        torch_backend=str(config["torch-backend"]) if config.get("torch-backend") else None,
+        omit_packages=tuple(_normalize_package_name(name) for name in _as_tuple(config.get("omit-packages"))),
+        required_modules=_as_tuple(config.get("required-imports")),
+        spec_only_modules=_as_tuple(config.get("spec-only-imports")),
+        pyproject_digest=digest,
+    )
+
+
+def _normalize_package_name(name: str) -> str:
+    """Return *name* in PEP 503 normalized form.
+
+    Runs of ``-``, ``_`` and ``.`` collapse to a single ``-`` and the result
+    is lower-cased, so ``Foo__Bar``, ``foo.bar`` and ``foo-bar`` all compare equal.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
+def _requirement_name(requirement: str) -> str:
+    """Return the normalized package name that starts *requirement*, or ``""`` if none."""
+    leading_name = re.match(r"\s*([A-Za-z0-9_.-]+)", requirement)
+    return _normalize_package_name(leading_name.group(1)) if leading_name else ""
+
+
+def _write_list(path: Path, values: Sequence[str]) -> Path | None:
+    """Write *values* to *path*, one per line; return *path*, or ``None`` if *values* is empty.
+
+    Nothing is written when *values* is empty.
+    """
+    if values:
+        path.write_text("\n".join(values) + "\n", encoding="utf-8")
+        return path
+    return None
+
+
+def _filter_requirement_text(requirements: str, omit_packages: Sequence[str]) -> str:
+    """Drop the requirement entries whose package name is in *omit_packages*.
+
+    Backslash-continued lines that belong to a dropped entry are dropped with
+    it. Blank lines, comments and option lines (starting with ``-``) are
+    always kept. The result is stripped and ends with exactly one newline.
+    """
+    omitted = set(omit_packages)
+    kept: list[str] = []
+    dropping_continuation = False
+    for raw_line in requirements.splitlines():
+        continues = raw_line.rstrip().endswith("\\")
+        if dropping_continuation:
+            dropping_continuation = continues
+            continue
+        text = raw_line.strip()
+        is_requirement = bool(text) and not text.startswith(("#", "-"))
+        if is_requirement and _requirement_name(text) in omitted:
+            dropping_continuation = continues
+            continue
+        kept.append(raw_line)
+    return "\n".join(kept).strip() + "\n"
+
+
+def _build_locked_requirement_files(
+    uv: Path,
+    metadata: ProjectMetadata,
+    spec: RuntimeSpec,
+    work_dir: Path,
+    env: dict[str, str],
+) -> dict[str, Path | None] | None:
+    """Export pinned requirements for the spec's extras with ``uv export``.
+
+    Returns ``None`` when the metadata has no lockfile. Otherwise the filtered
+    export is written to ``requirements.locked.txt`` in *work_dir* and returned
+    under the ``"requirements"`` key, with ``"constraints"`` and
+    ``"overrides"`` set to ``None``.
+
+    Raises:
+        RuntimeError: If the export fails or nothing remains after filtering.
+    """
+    if metadata.lockfile is None:
+        return None
+
+    export_command: list[str | Path] = [
+        uv,
+        "export",
+        "--frozen",
+        "--no-dev",
+        "--no-emit-project",
+        "--no-annotate",
+        "--no-hashes",
+        "--no-header",
+        "--format",
+        "requirements.txt",
+    ]
+    export_command += [arg for extra in spec.extras for arg in ("--extra", extra)]
+
+    exported = _run_capture(export_command, cwd=metadata.root, env=env)
+    requirements = _filter_requirement_text(exported, spec.omit_packages)
+    if not requirements.strip():
+        raise RuntimeError(f"Runtime {spec.name!r} exported no requirements from extras {spec.extras!r}")
+
+    locked_path = work_dir / "requirements.locked.txt"
+    locked_path.write_text(requirements, encoding="utf-8")
+    return {"requirements": locked_path, "constraints": None, "overrides": None}
+
+
+def _build_direct_requirement_files(
+    metadata: ProjectMetadata,
+    spec: RuntimeSpec,
+    work_dir: Path,
+) -> dict[str, Path | None]:
+    """Write unpinned requirement files straight from ``pyproject.toml`` data.
+
+    Requirements are the ``[project.optional-dependencies]`` entries of the
+    spec's extras minus omitted packages; constraints and overrides come from
+    ``[tool.uv]`` and map to ``None`` when empty.
+
+    Raises:
+        ValueError: If the optional-dependencies table is not a mapping, an
+            extra is undefined, or no requirement is left.
+    """
+    optional_deps = metadata.data.get("project", {}).get("optional-dependencies", {})
+    if not isinstance(optional_deps, Mapping):
+        raise ValueError("pyproject.toml is missing [project.optional-dependencies]")
+
+    requirements: list[str] = []
+    for extra in spec.extras:
+        deps = optional_deps.get(extra)
+        if deps is None:
+            raise ValueError(f"Runtime extra {extra!r} is not defined in pyproject.toml")
+        dep_texts = [str(dep) for dep in deps]
+        requirements.extend(text for text in dep_texts if _requirement_name(text) not in spec.omit_packages)
+
+    if not requirements:
+        raise ValueError(f"Runtime {spec.name!r} produced no requirements from extras {spec.extras!r}")
+
+    tool_uv = metadata.data.get("tool", {}).get("uv", {})
+    if not isinstance(tool_uv, Mapping):
+        tool_uv = {}
+
+    requirements_path = _write_list(work_dir / "requirements.in", requirements)
+    constraint_lines = tuple(str(item) for item in tool_uv.get("constraint-dependencies", []) or [])
+    constraints_path = _write_list(work_dir / "constraints.txt", constraint_lines)
+    override_lines = tuple(str(item) for item in tool_uv.get("override-dependencies", []) or [])
+    overrides_path = _write_list(work_dir / "overrides.txt", override_lines)
+    return {
+        "requirements": requirements_path,
+        "constraints": constraints_path,
+        "overrides": overrides_path,
+    }
+
+
+def _build_requirement_files(
+    metadata: ProjectMetadata,
+    spec: RuntimeSpec,
+    work_dir: Path,
+    *,
+    uv: Path | None = None,
+    env: dict[str, str] | None = None,
+) -> dict[str, Path | None]:
+    """Pick the requirement files to install from, in order of preference.
+
+    1. Files named by the spec itself (manifest mode); each must exist.
+    2. A locked export through ``uv`` when both *uv* and *env* are given and a
+       lockfile is available.
+    3. Direct, unpinned requirements read from ``pyproject.toml``.
+
+    Raises:
+        FileNotFoundError: If a file named by the spec does not exist.
+    """
+    if spec.requirements_file is None:
+        if uv is not None and env is not None:
+            locked = _build_locked_requirement_files(uv, metadata, spec, work_dir, env)
+            if locked is not None:
+                return locked
+        return _build_direct_requirement_files(metadata, spec, work_dir)
+
+    files = {
+        "requirements": spec.requirements_file,
+        "constraints": spec.constraints_file,
+        "overrides": spec.overrides_file,
+    }
+    for file_type, path in files.items():
+        if path is None or path.exists():
+            continue
+        raise FileNotFoundError(f"Runtime {file_type} file not found: {path}")
+    return files
+
+
+def _stamp_path(venv_dir: Path) -> Path:
+    """Return the location of the stamp file inside *venv_dir*."""
+    return venv_dir.joinpath(".nemotron-curator-runtime.stamp")
+
+
+def _stamp_matches(venv_dir: Path, spec: RuntimeSpec) -> bool:
+    """Return whether the stamp stored in *venv_dir* equals ``spec.stamp``."""
+    stamp_file = _stamp_path(venv_dir)
+    if not stamp_file.exists():
+        return False
+    recorded = stamp_file.read_text(encoding="utf-8").strip()
+    return recorded == spec.stamp
+
+
+def _write_stamp(venv_dir: Path, spec: RuntimeSpec) -> None:
+    """Record ``spec.stamp`` (followed by a newline) in the venv's stamp file."""
+    stamp_file = _stamp_path(venv_dir)
+    stamp_file.write_text(f"{spec.stamp}\n", encoding="utf-8")
+
+
+def _verify_profile(venv_python: Path, spec: RuntimeSpec, env: dict[str, str]) -> bool:
+    """Check, in a subprocess of *venv_python*, that the spec's modules can be located.
+
+    Both ``required_modules`` and ``spec_only_modules`` are probed with
+    ``importlib.util.find_spec``. Returns ``True`` without spawning anything
+    when both lists are empty, otherwise whether the probe exited with 0.
+    """
+    if not (spec.required_modules or spec.spec_only_modules):
+        return True
+    code = """
+import importlib.util
+import sys
+
+required = sys.argv[1].split(",") if sys.argv[1] else []
+spec_only = sys.argv[2].split(",") if sys.argv[2] else []
+missing = [name for name in required + spec_only if importlib.util.find_spec(name) is None]
+if missing:
+    print("missing modules: " + ", ".join(missing), file=sys.stderr)
+    raise SystemExit(1)
+"""
+    probe_argv = [
+        str(venv_python),
+        "-c",
+        code,
+        ",".join(spec.required_modules),
+        ",".join(spec.spec_only_modules),
+    ]
+    return subprocess.run(probe_argv, env=env).returncode == 0
+
+
+def ensure_runtime(
+    spec: RuntimeSpec,
+    *,
+    metadata: ProjectMetadata,
+    venv_root: Path = DEFAULT_VENV_ROOT,
+    curator_path: Path = DEFAULT_CURATOR_PATH,
+    recreate: bool = False,
+    skip_install: bool = False,
+) -> tuple[Path, dict[str, str]]:
+    """Ensure a runtime exists and return ``(python, env)`` for exec.
+
+    Args:
+        spec: Profile to materialize.
+        metadata: Source of requirement data for the install step.
+        venv_root: Directory under which ``spec.venv_name`` is created.
+        curator_path: Path added to ``PYTHONPATH`` when it exists.
+        recreate: Delete an existing venv before proceeding.
+        skip_install: Never install; succeed only if the module checks pass.
+
+    Raises:
+        RuntimeError: If ``skip_install`` is set and modules are missing, or if
+            the module checks still fail after installation.
+    """
+    venv_dir = venv_root / spec.venv_name
+    venv_python = _ensure_venv(venv_dir, recreate=recreate)
+    env = _runtime_env(venv_dir, curator_path)
+
+    # Fast path: the stamp proves this exact spec was installed before, and the
+    # module check guards against a venv that was modified afterwards.
+    if _stamp_matches(venv_dir, spec) and _verify_profile(venv_python, spec, env):
+        print(f"[curator-runtime] reusing {spec.name} runtime at {venv_dir}", flush=True)
+        return venv_python, env
+
+    if skip_install:
+        if _verify_profile(venv_python, spec, env):
+            return venv_python, env
+        raise RuntimeError(f"{spec.name} runtime is missing packages and --skip-install was set")
+
+    uv = _ensure_uv(venv_python, venv_dir, env)
+    # Generated requirement files only need to live for the duration of the install.
+    with tempfile.TemporaryDirectory(prefix="nemotron-runtime-") as td:
+        requirement_files = _build_requirement_files(metadata, spec, Path(td), uv=uv, env=env)
+        command: list[str | Path] = [
+            uv,
+            "pip",
+            "install",
+            "--python",
+            venv_python,
+            "--quiet",
+            "--no-cache",
+            "--requirements",
+            requirement_files["requirements"],
+        ]
+        if requirement_files["constraints"]:
+            command.extend(["--constraints", requirement_files["constraints"]])
+        if requirement_files["overrides"]:
+            command.extend(["--overrides", requirement_files["overrides"]])
+        for index_url in spec.extra_index_urls:
+            command.extend(["--extra-index-url", index_url])
+        if spec.torch_backend:
+            command.extend(["--torch-backend", spec.torch_backend])
+        _run(command, env=env)
+
+    # The stamp is written only after the checks pass, so a failed install is
+    # retried on the next invocation.
+    if not _verify_profile(venv_python, spec, env):
+        raise RuntimeError(f"{spec.name} runtime installation completed but import checks failed")
+    _write_stamp(venv_dir, spec)
+    return venv_python, env
+
+
+def _normalize_command(command: Sequence[str], venv_python: Path) -> list[str]:
+    """Turn the CLI remainder into an argv list that runs under the venv interpreter.
+
+    A leading ``--`` separator is removed, and a first element whose base name
+    is ``python`` or ``python3`` is replaced by *venv_python*.
+
+    Raises:
+        ValueError: If nothing is left to execute.
+    """
+    argv = list(command)
+    if argv[:1] == ["--"]:
+        del argv[0]
+    if not argv:
+        raise ValueError("missing command after '--'")
+    if Path(argv[0]).name in ("python", "python3"):
+        argv[0] = str(venv_python)
+    return argv
+
+
+def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
+    """Parse the bootstrap's command line; everything after the options is the command."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    add = parser.add_argument
+
+    add("--profile", default="byob", help="Runtime profile under [tool.nemotron.runtime].")
+    add("--extra", action="append", default=[], help="Override runtime extras from pyproject.toml.")
+    add("--project-metadata", type=Path, default=None, help="Directory containing runtime metadata.")
+    add("--venv-root", type=Path, default=DEFAULT_VENV_ROOT)
+    add("--curator-path", type=Path, default=DEFAULT_CURATOR_PATH)
+    add("--recreate", action="store_true", help="Recreate the runtime venv before installing.")
+    add(
+        "--skip-install",
+        action="store_true",
+        help="Use the venv/container as-is and fail if required modules are missing.",
+    )
+    add("command", nargs=argparse.REMAINDER, help="Command to exec after '--'.")
+
+    return parser.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> None:
+    """Resolve the runtime profile, ensure its environment, then exec the command.
+
+    On success this function does not return: the current process is replaced
+    through ``os.execvpe`` with the requested command running in the runtime
+    environment.
+
+    Raises:
+        ValueError: If no command was supplied after ``--``.
+    """
+    args = parse_args(argv)
+    # Fail fast: validate the command before the potentially slow environment
+    # build, so an empty command line is not reported only after installation.
+    # The interpreter passed here is a placeholder and the result is discarded.
+    _normalize_command(args.command, Path(sys.executable))
+
+    metadata = _find_project_metadata(args.project_metadata)
+    spec = load_runtime_spec(args.profile, metadata, cli_extras=args.extra)
+    venv_python, env = ensure_runtime(
+        spec,
+        metadata=metadata,
+        venv_root=args.venv_root,
+        curator_path=args.curator_path,
+        recreate=args.recreate,
+        skip_install=args.skip_install,
+    )
+    command = _normalize_command(args.command, venv_python)
+    print(f"[curator-runtime] exec {_quote(command)}", flush=True)
+    os.execvpe(command[0], command, env)
+
+
+# Script entry point: run the bootstrap only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/src/nemotron/steps/_bootstrap/runtime_payloads.py b/src/nemotron/steps/_bootstrap/runtime_payloads.py
new file mode 100644
index 000000000..e07dc656b
--- /dev/null
+++ b/src/nemotron/steps/_bootstrap/runtime_payloads.py
@@ -0,0 +1,362 @@
+"""Build compact runtime metadata for remote bootstrap execution."""
+
+from __future__ import annotations
+
+import base64
+import binascii
+import hashlib
+import json
+import re
+import shutil
+import subprocess
+import sys
+from collections.abc import Mapping
+from pathlib import Path
+from typing import Any
+
+if sys.version_info >= (3, 11):
+    import tomllib
+else:  # pragma: no cover - Python 3.10 fallback.
+    import tomli as tomllib  # type: ignore[no-redef]
+
+
+# ``runtime`` directory next to this module; default location read by ``read_runtime_payloads``.
+DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parent / "runtime"
+# Environment variable names used to ship the payload: the number of chunks,
+# the SHA-256 of the JSON document before base64 encoding, and the prefix of
+# the numbered chunk variables (``..._CHUNK_0``, ``..._CHUNK_1``, ...).
+RUNTIME_PAYLOAD_CHUNKS_ENV = "NEMOTRON_CURATOR_RUNTIME_CHUNKS"
+RUNTIME_PAYLOAD_SHA256_ENV = "NEMOTRON_CURATOR_RUNTIME_SHA256"
+RUNTIME_PAYLOAD_CHUNK_PREFIX = "NEMOTRON_CURATOR_RUNTIME_CHUNK_"
+# Default number of base64 characters stored per chunk variable.
+# NOTE(review): presumably chosen to stay below an executor's per-variable size limit — confirm.
+_ENV_CHUNK_SIZE = 9_000
+
+
+def _as_str_list(value: Any) -> list[str]:
+    """Coerce a config value to a list of strings.
+
+    A string becomes a one-element list, a list is converted item by item
+    with ``str``, and anything else (including ``None``) yields ``[]``.
+    """
+    if isinstance(value, str):
+        return [value]
+    if isinstance(value, list):
+        return list(map(str, value))
+    return []
+
+
+def _normalize_package_name(name: str) -> str:
+    """Return *name* in PEP 503 normalized form.
+
+    Runs of ``-``, ``_`` and ``.`` collapse to a single ``-`` and the result
+    is lower-cased, so ``Foo__Bar``, ``foo.bar`` and ``foo-bar`` all compare equal.
+    """
+    return re.sub(r"[-_.]+", "-", name).lower()
+
+
+def _requirement_name(requirement: str) -> str:
+    """Return the normalized package name that starts *requirement*, or ``""`` if none."""
+    leading_name = re.match(r"\s*([A-Za-z0-9_.-]+)", requirement)
+    if not leading_name:
+        return ""
+    return _normalize_package_name(leading_name.group(1))
+
+
+def _filter_requirement_text(requirements: str, omit_packages: list[str]) -> str:
+    """Drop the requirement entries whose package name is in *omit_packages*.
+
+    Backslash-continued lines that belong to a dropped entry are dropped with
+    it. Blank lines, comments and option lines (starting with ``-``) are
+    always kept. The result is stripped and ends with exactly one newline.
+    """
+    omitted = set(omit_packages)
+    kept: list[str] = []
+    dropping_continuation = False
+    for raw_line in requirements.splitlines():
+        continues = raw_line.rstrip().endswith("\\")
+        if dropping_continuation:
+            dropping_continuation = continues
+            continue
+        text = raw_line.strip()
+        is_requirement = bool(text) and not text.startswith(("#", "-"))
+        if is_requirement and _requirement_name(text) in omitted:
+            dropping_continuation = continues
+            continue
+        kept.append(raw_line)
+    return "\n".join(kept).strip() + "\n"
+
+
+def _direct_extra_requirements(data: dict[str, Any], extras: list[str], omit_packages: list[str]) -> str:
+    """Return the unpinned requirements of *extras* as newline-terminated text.
+
+    Entries come from ``[project.optional-dependencies]``; those whose
+    normalized name is in *omit_packages* are left out.
+
+    Raises:
+        ValueError: If an extra is undefined or its value is not a list.
+    """
+    optional_deps = data.get("project", {}).get("optional-dependencies", {})
+    selected: list[str] = []
+    for extra in extras:
+        deps = optional_deps.get(extra)
+        if deps is None:
+            raise ValueError(f"Runtime extra {extra!r} is not defined in pyproject.toml")
+        if not isinstance(deps, list):
+            raise ValueError(f"Runtime extra {extra!r} must be a list in pyproject.toml")
+        dep_texts = [str(dep) for dep in deps]
+        selected.extend(text for text in dep_texts if _requirement_name(text) not in omit_packages)
+    return "\n".join(selected).strip() + "\n"
+
+
+def _runtime_dependency_list(data: dict[str, Any], config: dict[str, Any], key: str) -> list[str]:
+    """Return the list stored under *key*, preferring the profile over ``[tool.uv]``.
+
+    A non-empty value in *config* wins; otherwise the same key is read from the
+    ``[tool.uv]`` table of *data*, and ``[]`` is returned if that table is not
+    a dict.
+    """
+    from_profile = _as_str_list(config.get(key))
+    if from_profile:
+        return from_profile
+    tool_uv = data.get("tool", {}).get("uv", {})
+    return _as_str_list(tool_uv.get(key)) if isinstance(tool_uv, dict) else []
+
+
+def _add_payload(
+    payloads: list[tuple[str, bytes]],
+    payload_names_by_content: dict[bytes, str],
+    name: str,
+    content: bytes,
+) -> str:
+    """Register *content* under *name* unless identical content is already registered.
+
+    Returns the name under which the content is stored: the previously
+    registered name for duplicate content, otherwise *name*. Both *payloads*
+    and *payload_names_by_content* are updated in place for new content.
+    """
+    known_name = payload_names_by_content.get(content)
+    if known_name is None:
+        payloads.append((name, content))
+        payload_names_by_content[content] = name
+        return name
+    return known_name
+
+
+def _add_text_payload(
+    payloads: list[tuple[str, bytes]],
+    payload_names_by_content: dict[bytes, str],
+    name: str,
+    values: list[str],
+) -> str | None:
+    """Register *values* as a newline-joined UTF-8 text payload.
+
+    Returns ``None`` without registering anything when *values* is empty,
+    otherwise the name returned by ``_add_payload``.
+    """
+    if not values:
+        return None
+    text = "\n".join(values) + "\n"
+    return _add_payload(payloads, payload_names_by_content, name, text.encode("utf-8"))
+
+
+def _validate_payload_name(name: str) -> None:
+    """Reject payload names that are not one plain file name.
+
+    Raises:
+        ValueError: If *name* is absolute, has more or fewer than one path
+            component, or is ``""``, ``"."`` or ``".."``.
+    """
+    candidate = Path(name)
+    single_component = not candidate.is_absolute() and len(candidate.parts) == 1
+    if single_component and name not in {"", ".", ".."}:
+        return
+    raise ValueError(f"Runtime payload name must be a single file name: {name!r}")
+
+
+def encode_runtime_payload_env(
+    payloads: list[tuple[str, bytes]],
+    *,
+    chunk_size: int = _ENV_CHUNK_SIZE,
+) -> dict[str, str]:
+    """Encode generated runtime payload files into chunked env vars.
+
+    The files are wrapped in a versioned JSON document, base64-encoded and
+    split into pieces of at most *chunk_size* characters. The returned mapping
+    holds the chunk count, the SHA-256 of the JSON document and one variable
+    per chunk.
+
+    Raises:
+        ValueError: If a payload name is not a single file name.
+    """
+    entries = []
+    for file_name, file_bytes in payloads:
+        _validate_payload_name(file_name)
+        entries.append({"name": file_name, "content": base64.b64encode(file_bytes).decode("ascii")})
+
+    document = json.dumps({"version": 1, "files": entries}, sort_keys=True).encode("utf-8")
+    encoded = base64.b64encode(document).decode("ascii")
+    chunks = [encoded[start : start + chunk_size] for start in range(0, len(encoded), chunk_size)]
+
+    env = {
+        RUNTIME_PAYLOAD_CHUNKS_ENV: str(len(chunks)),
+        RUNTIME_PAYLOAD_SHA256_ENV: hashlib.sha256(document).hexdigest(),
+    }
+    env.update({f"{RUNTIME_PAYLOAD_CHUNK_PREFIX}{index}": chunk for index, chunk in enumerate(chunks)})
+    return env
+
+
+def _payload_digest_context(env: Mapping[str, str]) -> str:
+    """Return `` sha256=<digest>`` for error messages, or ``""`` when no digest is set."""
+    digest = env.get(RUNTIME_PAYLOAD_SHA256_ENV)
+    if not digest:
+        return ""
+    return f" sha256={digest}"
+
+
+def decode_runtime_payload_env(env: Mapping[str, str]) -> list[tuple[str, bytes]]:
+    """Decode runtime payload files from env vars, returning an empty list if absent.
+
+    Returns:
+        ``(name, content)`` pairs in the order stored in the payload document.
+
+    Raises:
+        RuntimeError: If only one of the chunk-count/digest variables is set,
+            the chunk count is not a positive integer, a chunk is missing, the
+            joined chunks are not valid base64, or the digest does not match.
+        ValueError: If the payload version is unsupported or a file name is
+            not a single path component.
+    """
+    chunks_value = env.get(RUNTIME_PAYLOAD_CHUNKS_ENV)
+    if not chunks_value:
+        # A digest without a chunk count means part of the payload was dropped
+        # on the way; only the absence of both counts as "no payload".
+        if env.get(RUNTIME_PAYLOAD_SHA256_ENV):
+            raise RuntimeError(
+                "Curator runtime payload is incomplete: "
+                f"{RUNTIME_PAYLOAD_CHUNKS_ENV} is missing. "
+                "Check whether the executor filters NEMOTRON_CURATOR_RUNTIME_* environment variables."
+            )
+        return []
+
+    digest_context = _payload_digest_context(env)
+    if not env.get(RUNTIME_PAYLOAD_SHA256_ENV):
+        raise RuntimeError(
+            "Curator runtime payload is incomplete: "
+            f"{RUNTIME_PAYLOAD_SHA256_ENV} is missing. "
+            "Check whether the executor filters NEMOTRON_CURATOR_RUNTIME_* environment variables."
+        )
+    try:
+        count = int(chunks_value)
+    except ValueError as exc:
+        raise RuntimeError(f"Invalid Curator runtime payload chunk count {chunks_value!r}.{digest_context}") from exc
+    if count < 1:
+        raise RuntimeError(f"Invalid Curator runtime payload chunk count {count}.{digest_context}")
+
+    # Report every missing index at once rather than failing on the first one.
+    missing = [idx for idx in range(count) if f"{RUNTIME_PAYLOAD_CHUNK_PREFIX}{idx}" not in env]
+    if missing:
+        missing_text = ", ".join(str(idx) for idx in missing)
+        raise RuntimeError(
+            "Curator runtime payload is incomplete: "
+            f"missing chunk index(es) {missing_text} of {count}.{digest_context} "
+            "Check whether the executor filters NEMOTRON_CURATOR_RUNTIME_* environment variables."
+        )
+
+    encoded = "".join(env[f"{RUNTIME_PAYLOAD_CHUNK_PREFIX}{idx}"] for idx in range(count))
+    try:
+        raw = base64.b64decode(encoded, validate=True)
+    except binascii.Error as exc:
+        raise RuntimeError(f"Curator runtime payload is not valid base64.{digest_context}") from exc
+    # The digest covers the decoded JSON document, not the base64 text.
+    expected = env.get(RUNTIME_PAYLOAD_SHA256_ENV)
+    if expected and hashlib.sha256(raw).hexdigest() != expected:
+        raise RuntimeError("Curator runtime payload digest mismatch")
+
+    payload = json.loads(raw.decode("utf-8"))
+    if payload.get("version") != 1:
+        raise ValueError("Unsupported Curator runtime payload version")
+
+    decoded: list[tuple[str, bytes]] = []
+    for item in payload.get("files", []):
+        name = str(item["name"])
+        # Names are re-validated on decode because they are later joined onto
+        # an output directory.
+        _validate_payload_name(name)
+        decoded.append((name, base64.b64decode(item["content"])))
+    return decoded
+
+
+def write_runtime_payloads_from_env(output_dir: Path, env: Mapping[str, str]) -> bool:
+    """Write env-encoded runtime payloads into ``output_dir`` if present.
+
+    Returns ``False`` (creating nothing) when *env* carries no payload,
+    otherwise ``True`` after every decoded file has been written.
+    """
+    decoded_files = decode_runtime_payload_env(env)
+    if decoded_files:
+        output_dir.mkdir(parents=True, exist_ok=True)
+        for file_name, file_bytes in decoded_files:
+            output_dir.joinpath(file_name).write_bytes(file_bytes)
+        return True
+    return False
+
+
+def read_runtime_payloads(runtime_dir: Path = DEFAULT_OUTPUT_DIR) -> list[tuple[str, bytes]]:
+    """Read a packaged runtime payload directory if one is present.
+
+    Returns ``[]`` when *runtime_dir* has no ``runtime.json``. Otherwise
+    returns ``(name, content)`` pairs, sorted by name, for the manifest and
+    every requirements/constraints/overrides file its profiles reference.
+
+    Raises:
+        ValueError: If ``profiles`` is not a dict or a referenced name is not
+            a single file name.
+        FileNotFoundError: If a referenced file is missing.
+    """
+    manifest = runtime_dir / "runtime.json"
+    if not manifest.is_file():
+        return []
+
+    data = json.loads(manifest.read_text(encoding="utf-8"))
+    profiles = data.get("profiles", {})
+    if not isinstance(profiles, dict):
+        raise ValueError(f"Runtime manifest has invalid profiles table: {manifest}")
+
+    names = {"runtime.json"}
+    for profile in profiles.values():
+        if isinstance(profile, dict):
+            for key in ("requirements", "constraints", "overrides"):
+                referenced = profile.get(key)
+                if not referenced:
+                    continue
+                _validate_payload_name(str(referenced))
+                names.add(str(referenced))
+
+    payloads: list[tuple[str, bytes]] = []
+    for file_name in sorted(names):
+        _validate_payload_name(file_name)
+        file_path = runtime_dir / file_name
+        if not file_path.is_file():
+            raise FileNotFoundError(f"Runtime manifest references missing payload file: {file_path}")
+        payloads.append((file_name, file_path.read_bytes()))
+    return payloads
+
+
+def _export_extra_requirements(root: Path, data: dict[str, Any], extras: list[str], omit_packages: list[str]) -> str:
+    """Return requirement text for *extras*, pinned through ``uv export`` when possible.
+
+    Falls back to the unpinned entries from ``pyproject.toml`` when ``uv`` is
+    not on ``PATH`` or *root* has no ``uv.lock``. Packages in *omit_packages*
+    are removed in both cases.
+
+    Raises:
+        RuntimeError: If ``uv export`` exits with a non-zero status.
+    """
+    uv = shutil.which("uv")
+    if not (uv and (root / "uv.lock").exists()):
+        return _direct_extra_requirements(data, extras, omit_packages)
+
+    command = [
+        uv,
+        "export",
+        "--frozen",
+        "--no-dev",
+        "--no-emit-project",
+        "--no-annotate",
+        "--no-hashes",
+        "--no-header",
+        "--format",
+        "requirements.txt",
+    ]
+    command += [arg for extra in extras for arg in ("--extra", extra)]
+
+    completed = subprocess.run(command, cwd=root, text=True, capture_output=True)
+    if completed.returncode == 0:
+        return _filter_requirement_text(completed.stdout, omit_packages)
+
+    message = completed.stderr.strip() or completed.stdout.strip()
+    raise RuntimeError(
+        "uv export failed while building Curator runtime metadata. "
+        f"Command: {' '.join(command)}\n{message}"
+    )
+
+
+def build_runtime_payloads(root: Path) -> list[tuple[str, bytes]]:
+    """Return ``(file name, bytes)`` payloads for the ``[tool.nemotron.runtime]`` profiles in ``pyproject.toml``."""
+    pyproject = root / "pyproject.toml"
+    if not pyproject.exists():
+        return []
+    data = tomllib.loads(pyproject.read_text(encoding="utf-8"))
+    runtime_table = data.get("tool", {}).get("nemotron", {}).get("runtime", {})
+    if not isinstance(runtime_table, dict):
+        return []
+
+    manifest: dict[str, Any] = {"version": 1, "profiles": {}}
+    payloads: list[tuple[str, bytes]] = []
+    payload_names_by_content: dict[bytes, str] = {}  # NOTE(review): presumably dedupes identical content — confirm
+    for name, config in sorted(runtime_table.items()):
+        if not isinstance(config, dict):
+            continue
+        extras = _as_str_list(config.get("extras")) or [name]  # default: an extra named after the profile
+        omit_packages = [_normalize_package_name(item) for item in _as_str_list(config.get("omit-packages"))]
+        requirements = _export_extra_requirements(root, data, extras, omit_packages)
+        if not requirements.strip():
+            continue  # nothing exported for this profile, so it gets no manifest entry
+        requirements_name = f"{name}.requirements.txt"  # requested name; replaced by _add_payload's return below
+        constraints_name = _add_text_payload(
+            payloads,
+            payload_names_by_content,
+            f"{name}.constraints.txt",
+            _runtime_dependency_list(data, config, "constraint-dependencies"),
+        )
+        overrides_name = _add_text_payload(
+            payloads,
+            payload_names_by_content,
+            f"{name}.overrides.txt",
+            _runtime_dependency_list(data, config, "override-dependencies"),
+        )
+        # Fingerprint covers pyproject.toml, uv.lock (when present) and the exported requirements text.
+        digest = hashlib.sha256()
+        digest.update(pyproject.read_bytes())
+        lockfile = root / "uv.lock"
+        if lockfile.exists():
+            digest.update(lockfile.read_bytes())
+        digest.update(requirements.encode("utf-8"))
+        requirements_name = _add_payload(
+            payloads,
+            payload_names_by_content,
+            requirements_name,
+            requirements.encode("utf-8"),
+        )
+        manifest["profiles"][name] = {
+            "name": name,
+            "venv_name": str(config.get("venv-name") or name),
+            "extras": extras,
+            "requirements": requirements_name,
+            "extra_index_urls": _as_str_list(config.get("extra-index-urls")),
+            "torch_backend": config.get("torch-backend"),
+            "required_modules": _as_str_list(config.get("required-imports")),
+            "spec_only_modules": _as_str_list(config.get("spec-only-imports")),
+            "digest": digest.hexdigest()[:16],  # first 16 hex characters of the SHA-256
+        }
+        # Optional keys are only recorded when the helper returned a truthy file name.
+        if constraints_name:
+            manifest["profiles"][name]["constraints"] = constraints_name
+        if overrides_name:
+            manifest["profiles"][name]["overrides"] = overrides_name
+    if not manifest["profiles"]:
+        return []  # no usable profile: emit no payloads at all, not even the manifest
+    manifest_bytes = json.dumps(manifest, indent=2, sort_keys=True).encode("utf-8")
+    return [("runtime.json", manifest_bytes), *payloads]
+
+
+def write_runtime_payloads(root: Path, output_dir: Path = DEFAULT_OUTPUT_DIR) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)  # created even when no payloads are produced
+    for name, content in build_runtime_payloads(root):  # (file name, raw bytes) pairs
+        (output_dir / name).write_bytes(content)
+
+
+def main() -> None:
+    """Parse ``--repo-root``/``--output-dir`` and write the runtime payloads."""
+    from argparse import ArgumentParser
+    cli = ArgumentParser(description=__doc__)
+    cli.add_argument("--repo-root", type=Path, default=Path.cwd())
+    cli.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
+    options = cli.parse_args()
+    write_runtime_payloads(options.repo_root, options.output_dir)
+
+
+if __name__ == "__main__":
+    main()  # run the CLI when this module is executed directly
diff --git a/src/nemotron/steps/_runners/convert.py b/src/nemotron/steps/_runners/convert.py
new file mode 100644
index 000000000..eda626380
--- /dev/null
+++ b/src/nemotron/steps/_runners/convert.py
@@ -0,0 +1,359 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared runners for checkpoint conversion steps."""
+
+from __future__ import annotations
+
+import inspect
+import os
+import re
+import shlex
+import subprocess
+import sys
+from collections.abc import Callable, Mapping
+from pathlib import Path
+from typing import Any
+
+from omegaconf import OmegaConf
+
+from nemotron.kit.train_script import apply_hydra_overrides, load_omegaconf_yaml, parse_config_and_overrides
+
+_PEFT_ADAPTER_CONFIG = "adapter_config.json"  # file whose presence marks a HF PEFT adapter directory
+
+
+def load_convert_config(default_config: Path) -> dict[str, Any]:
+    """Resolve the step YAML plus CLI overrides into a plain ``dict``; raise ``TypeError`` if not a mapping."""
+    config_path, cli_overrides = parse_config_and_overrides(default_config=default_config)
+    merged = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
+    resolved = OmegaConf.to_container(merged, resolve=True)
+    if isinstance(resolved, dict):
+        return resolved
+    raise TypeError(f"{config_path} must contain a YAML mapping")
+
+
+def run_hf_to_megatron(default_config: Path) -> None:
+    """Step entry point: load the convert config and import the HF checkpoint into Megatron format."""
+    import_hf_to_megatron(load_convert_config(default_config))
+
+
+def run_megatron_to_hf(default_config: Path) -> None:
+    """Step entry point: load the convert config and export a Megatron checkpoint to HF format."""
+    cfg = load_convert_config(default_config)
+
+    # Required paths are read first so a missing key fails before any export work starts.
+    required = {key: _required_str(cfg, key) for key in ("megatron_path", "hf_model_id", "hf_path")}
+
+    # The three boolean switches all default to True when absent from the config.
+    toggles = {flag: _as_bool(cfg.get(flag, True)) for flag in ("trust_remote_code", "show_progress", "strict")}
+    export_megatron_to_hf(**required, **toggles)
+
+
+def run_merge_lora(default_config: Path) -> None:
+    """Step entry point: merge a LoRA adapter with the backend selected by the config.
+
+    Raises ``ValueError`` when the resolved backend is neither ``hf_peft`` nor ``megatron_bridge``.
+    """
+    cfg = load_convert_config(default_config)
+    backend = _resolve_merge_backend(cfg)
+
+    # "auto" has already been resolved to a concrete backend at this point.
+    handlers = {"hf_peft": merge_hf_peft_lora, "megatron_bridge": merge_megatron_bridge_lora}
+    if backend not in handlers:
+        raise ValueError("merge_lora backend must be one of: auto, hf_peft, megatron_bridge")
+    handlers[backend](cfg)
+
+
+def import_hf_to_megatron(cfg: Mapping[str, Any]) -> None:
+    """Import the HF checkpoint named by ``cfg`` into ``megatron_path`` via ``AutoBridge.import_ckpt``."""
+    call_kwargs: dict[str, Any] = {
+        "hf_model_id": _required_str(cfg, "hf_model_id"),
+        "megatron_path": _required_str(cfg, "megatron_path"),
+    }
+    # ``torch_dtype`` wins over the shorter ``dtype`` key; optional keys are forwarded only when set.
+    requested_dtype = cfg.get("torch_dtype") or cfg.get("dtype")
+    if requested_dtype:
+        call_kwargs["torch_dtype"] = _torch_dtype(str(requested_dtype))
+    if cfg.get("device_map"):
+        call_kwargs["device_map"] = str(cfg["device_map"])
+    if cfg.get("trust_remote_code") is not None:
+        call_kwargs["trust_remote_code"] = _as_bool(cfg["trust_remote_code"])
+    from megatron.bridge import AutoBridge
+    print(f"Importing HF checkpoint {call_kwargs['hf_model_id']} -> {call_kwargs['megatron_path']}", flush=True)
+    _call_with_supported_kwargs(AutoBridge.import_ckpt, **call_kwargs)
+
+
+def export_megatron_to_hf(
+    *,
+    megatron_path: str,
+    hf_model_id: str,
+    hf_path: str,
+    trust_remote_code: bool = True,
+    show_progress: bool = True,
+    strict: bool = True,
+) -> None:
+    """Export the Megatron checkpoint at ``megatron_path`` to a HuggingFace checkpoint at ``hf_path``.
+
+    The bridge is built from ``hf_model_id``; export keyword arguments are filtered through
+    ``_call_with_supported_kwargs`` so older ``export_ckpt`` signatures keep working.
+    """
+    from megatron.bridge import AutoBridge
+
+    print(f"Exporting Megatron checkpoint {megatron_path} -> {hf_path}", flush=True)
+    bridge = _autobridge_for_hf_export(
+        AutoBridge, megatron_path=megatron_path, hf_model_id=hf_model_id, trust_remote_code=trust_remote_code
+    )
+    # Paths first, then the optional switches, matching the keyword order callers rely on.
+    export_options = {"show_progress": show_progress, "strict": strict}
+    _call_with_supported_kwargs(
+        bridge.export_ckpt, megatron_path=megatron_path, hf_path=hf_path, **export_options
+    )
+
+
+def merge_hf_peft_lora(cfg: Mapping[str, Any]) -> None:
+    """Fold a HuggingFace PEFT adapter into its base model and write a standalone HF checkpoint."""
+    try:
+        from peft import PeftModel
+    except ImportError as exc:
+        raise ImportError(
+            "convert/merge_lora backend=hf_peft requires the optional 'peft' package. "
+            "Install PEFT in the runtime image or use backend=megatron_bridge for Megatron-Bridge adapters."
+        ) from exc
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    base_hf_path = _required_str(cfg, "base_hf_path")
+    requested = _required_str(cfg, "lora_checkpoint")
+    adapter_dir = _resolve_hf_peft_adapter_path(requested)
+    destination = Path(_required_str(cfg, "output_hf_path"))
+    trust_remote_code = _as_bool(cfg.get("trust_remote_code", True))
+
+    load_kwargs: dict[str, Any] = {
+        "trust_remote_code": trust_remote_code,
+        "low_cpu_mem_usage": _as_bool(cfg.get("low_cpu_mem_usage", True)),
+    }
+    if cfg.get("device_map"):
+        load_kwargs["device_map"] = str(cfg["device_map"])
+    if cfg.get("torch_dtype"):
+        dtype_name = str(cfg["torch_dtype"])
+        # "auto" is passed through verbatim; any other name is mapped to a torch dtype object.
+        load_kwargs["torch_dtype"] = "auto" if dtype_name == "auto" else _torch_dtype(dtype_name)
+
+    if adapter_dir != requested:
+        print(f"Resolved HF PEFT adapter {requested} -> {adapter_dir}", flush=True)
+    print(f"Merging HF PEFT adapter {adapter_dir} into {base_hf_path}", flush=True)
+    base_model = AutoModelForCausalLM.from_pretrained(base_hf_path, **load_kwargs)
+    merged_model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()
+    destination.mkdir(parents=True, exist_ok=True)
+    merged_model.save_pretrained(destination, safe_serialization=_as_bool(cfg.get("safe_serialization", True)))
+
+    if _as_bool(cfg.get("save_tokenizer", True)):
+        AutoTokenizer.from_pretrained(base_hf_path, trust_remote_code=trust_remote_code).save_pretrained(destination)
+    print(f"Merged HF checkpoint written to {destination}", flush=True)
+
+
+def merge_megatron_bridge_lora(cfg: Mapping[str, Any]) -> None:
+    """Merge a Megatron-Bridge adapter, then optionally export the merged checkpoint to HF."""
+    merged_megatron_path = _required_str(cfg, "output_megatron_path")
+    command = build_megatron_lora_merge_command(cfg, merged_megatron_path=merged_megatron_path)
+    # Validate export settings up front so a missing key raises ValueError before the costly merge runs.
+    export_kwargs: dict[str, Any] | None = None
+    if _as_bool(cfg.get("export_hf", True)):
+        export_kwargs = {
+            "hf_model_id": _required_str(cfg, "hf_model_id", fallback_keys=("hf_model_path", "base_hf_path")),
+            "hf_path": _required_str(cfg, "output_hf_path"),
+            "trust_remote_code": _as_bool(cfg.get("trust_remote_code", True)),
+            "show_progress": _as_bool(cfg.get("show_progress", True)),
+            "strict": _as_bool(cfg.get("strict", True)),
+        }
+
+    print(f"$ {shlex.join(command)}", flush=True)
+    subprocess.run(command, check=True)  # raises CalledProcessError when the merge script fails
+    if export_kwargs is not None:
+        export_megatron_to_hf(megatron_path=merged_megatron_path, **export_kwargs)
+
+
+def build_megatron_lora_merge_command(
+    cfg: Mapping[str, Any],
+    *,
+    merged_megatron_path: str,
+) -> list[str]:
+    """Assemble the argv that runs the upstream Megatron-Bridge ``merge_lora.py`` script.
+
+    The script runs under the current interpreter for CPU merges or when ``WORLD_SIZE`` and ``RANK``
+    are already set in the environment; otherwise it is wrapped in ``torchrun``.
+    """
+    script = str(cfg.get("upstream_script") or "/opt/Megatron-Bridge/examples/peft/merge_lora.py")
+    cpu = _as_bool(cfg.get("cpu", True))
+    parallelism = {flag: int(cfg.get(flag, 1)) for flag in ("tp", "pp", "ep")}
+
+    already_distributed = bool(os.environ.get("WORLD_SIZE") and os.environ.get("RANK"))
+    if cpu or already_distributed:
+        launcher = [sys.executable, script]
+    else:
+        nproc = int(cfg.get("nproc_per_node") or cfg.get("gpus_per_node") or parallelism["tp"])
+        launcher = ["torchrun", f"--nproc_per_node={nproc}", script]
+
+    command = [
+        *launcher,
+        "--lora-checkpoint",
+        _required_str(cfg, "lora_checkpoint"),
+        "--hf-model-path",
+        _required_str(cfg, "hf_model_path", fallback_keys=("hf_model_id", "base_hf_path")),
+        "--output",
+        merged_megatron_path,
+    ]
+    # Parallel sizes are always passed explicitly, in tp/pp/ep order.
+    for flag, size in parallelism.items():
+        command += [f"--{flag}", str(size)]
+    pretrained = cfg.get("pretrained") or cfg.get("base_megatron_path")
+    if pretrained:
+        command += ["--pretrained", str(pretrained)]
+    if cpu:
+        command.append("--cpu")
+    if _as_bool(cfg.get("debug", False)):
+        command.append("--debug")
+    return command
+
+
+def _resolve_merge_backend(cfg: Mapping[str, Any]) -> str:
+    """Return the lower-cased merge backend, resolving ``auto`` from the presence of a Megatron base path."""
+    requested = str(cfg.get("backend") or "auto").lower()
+    if requested == "auto":
+        has_megatron_base = cfg.get("base_megatron_path") or cfg.get("pretrained")
+        return "megatron_bridge" if has_megatron_base else "hf_peft"
+    return requested
+
+
+def _resolve_hf_peft_adapter_path(lora_checkpoint: str) -> str:
+    """Return the directory holding ``adapter_config.json`` at or beneath ``lora_checkpoint``.
+
+    Nested candidates are ranked by ``_adapter_config_sort_key`` and the highest one wins.
+    Raises ``ValueError`` when no adapter config can be found.
+    """
+    root = Path(lora_checkpoint)
+    if (root / _PEFT_ADAPTER_CONFIG).is_file():
+        return str(root)
+    candidates = root.rglob(_PEFT_ADAPTER_CONFIG) if root.is_dir() else ()
+    ranked = sorted(candidates, key=lambda found: _adapter_config_sort_key(found, root))
+    if ranked:
+        return str(ranked[-1].parent)
+    raise ValueError(
+        f"Can't find {_PEFT_ADAPTER_CONFIG!r} at or below {lora_checkpoint!r}. "
+        "Set lora_checkpoint to the adapter directory containing adapter_config.json. "
+        f"To inspect candidates, run: find {shlex.quote(lora_checkpoint)} -name {_PEFT_ADAPTER_CONFIG} -print"
+    )
+
+
+def _adapter_config_sort_key(config_path: Path, root_path: Path) -> tuple[int, float, str]:
+    """Rank an adapter config by the largest number in its relative directory, then mtime, then path."""
+    adapter_dir = config_path.parent
+    try:
+        label = str(adapter_dir.relative_to(root_path))
+    except ValueError:
+        label = str(adapter_dir)  # not under root_path: fall back to the full directory path
+    step = max(map(int, re.findall(r"\d+", label)), default=-1)
+    try:
+        modified = config_path.stat().st_mtime
+    except OSError:
+        modified = 0.0
+    return step, modified, str(adapter_dir)
+
+
+def _call_with_supported_kwargs(func: Callable[..., Any], **kwargs: Any) -> Any:
+    """Call ``func`` via ``_call_with_supported_args`` with keyword arguments only (tolerates older signatures)."""
+    return _call_with_supported_args(func, **kwargs)
+
+
+def _call_with_supported_args(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
+    """Call ``func`` with ``args`` unchanged and ``kwargs`` narrowed to the names its signature declares."""
+    try:
+        declared = inspect.signature(func).parameters
+    except (TypeError, ValueError):
+        # Signature not introspectable: pass everything through untouched.
+        return func(*args, **kwargs)
+
+    accepts_any_keyword = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in declared.values())
+    if not accepts_any_keyword:
+        kwargs = {name: value for name, value in kwargs.items() if name in declared}
+    return func(*args, **kwargs)
+
+
+def _autobridge_for_hf_export(
+    auto_bridge: type,
+    *,
+    megatron_path: str,
+    hf_model_id: str,
+    trust_remote_code: bool,
+) -> Any:
+    """Build an AutoBridge for Megatron -> HF export using the first constructor ``auto_bridge`` offers.
+
+    Constructors are tried in order: ``from_hf_pretrained``, ``from_auto_config``, ``from_hf_config``.
+    Raises ``AttributeError`` when none of them exists.
+    """
+    if (factory := getattr(auto_bridge, "from_hf_pretrained", None)) is not None:
+        return _call_with_supported_args(
+            factory, hf_model_id, trust_remote_code=trust_remote_code
+        )
+
+    if (factory := getattr(auto_bridge, "from_auto_config", None)) is not None:
+        return _call_with_supported_args(
+            factory,
+            megatron_path,
+            hf_model_id,
+            trust_remote_code=trust_remote_code,
+        )
+
+    if (factory := getattr(auto_bridge, "from_hf_config", None)) is not None:
+        # Imported lazily: transformers is only needed on this fallback path.
+        from transformers import AutoConfig
+
+        hf_config = AutoConfig.from_pretrained(hf_model_id, trust_remote_code=trust_remote_code)
+        return _call_with_supported_args(factory, hf_config, trust_remote_code=trust_remote_code)
+
+    raise AttributeError(
+        "AutoBridge does not provide from_hf_pretrained, from_auto_config, or from_hf_config; "
+        "cannot construct a Megatron -> HF export bridge"
+    )
+
+
+def _torch_dtype(name: str) -> Any:
+    """Map a dtype name (e.g. ``bfloat16``, ``bf16``, ``torch.float16``) to a torch dtype; else ``ValueError``."""
+    import torch
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    aliases = {"fp32": "float32", "fp16": "float16", "half": "float16", "bf16": "bfloat16"}
+    key = name.strip().lower()  # tolerate case and surrounding whitespace
+    key = key[len("torch."):] if key.startswith("torch.") else key
+    key = aliases.get(key, key)
+    if key not in dtype_map:
+        raise ValueError(f"Unsupported torch dtype {name!r}; choose one of {sorted(dtype_map)}")
+    return dtype_map[key]
+
+
+def _required_str(cfg: Mapping[str, Any], key: str, *, fallback_keys: tuple[str, ...] = ()) -> str:
+    """Return the first non-empty value among ``key`` and ``fallback_keys`` as ``str``; else ``ValueError``."""
+    candidates = (key, *fallback_keys)
+    for value in map(cfg.get, candidates):
+        if value not in (None, ""):
+            return str(value)
+    raise ValueError(f"Missing required config value: {', '.join(candidates)}")
+
+
+def _as_bool(value: Any) -> bool:
+    """Coerce a config value to ``bool``.
+
+    Strings are true only for ``1``/``true``/``yes``/``on`` (case-insensitive); ``None`` is false.
+    """
+    if isinstance(value, str):
+        return value.lower() in {"1", "true", "yes", "on"}
+    return bool(value)
diff --git a/src/nemotron/steps/benchmark/__init__.py b/src/nemotron/steps/benchmark/__init__.py
deleted file mode 100644
index e5c866753..000000000
--- a/src/nemotron/steps/benchmark/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Benchmark step category."""
diff --git a/src/nemotron/steps/byob/SKILL.md b/src/nemotron/steps/byob/SKILL.md
index 8500641f1..c71886633 100644
--- a/src/nemotron/steps/byob/SKILL.md
+++ b/src/nemotron/steps/byob/SKILL.md
@@ -16,10 +16,12 @@ Use this skill to create or translate benchmark artifacts while keeping benchmar
 
 1. Install BYOB runtime dependencies with `uv sync --extra byob` or `pip install ".[byob]"` in the target environment.
 2. Read [references/STEP.md](references/STEP.md) for the artifact contract.
-3. Start from [config/default.yaml](config/default.yaml) for MCQ generation or [config/translate.yaml](config/translate.yaml) for translation.
-4. Run `nemotron byob --family mcq --stage prepare --config CONFIG`.
-5. Run `nemotron byob --family mcq --stage generate --config CONFIG`.
-6. Translate an existing benchmark with `--stage translate` and a translation config.
+3. Start from [mcq/config/default.yaml](mcq/config/default.yaml) for MCQ generation or [mcq/config/translate.yaml](mcq/config/translate.yaml) for translation.
+4. Ensure generation configs include `target_source_mapping` and explicit
+   `filtering_model_configs`.
+5. Run `nemotron steps run byob/mcq -c <CONFIG> stage=prepare family=mcq`.
+6. Run `nemotron steps run byob/mcq -c <CONFIG> stage=generate family=mcq`.
+7. Translate an existing benchmark with `stage=translate` and a translation config.
 
 ## Change Points
 
@@ -46,7 +48,7 @@ Use this skill to create or translate benchmark artifacts while keeping benchmar
 ## Validate
 
 - Run `python -m nemotron.steps.byob.scripts.validate`.
-- Run `nemotron byob --list-families`.
+- Run `python -m nemotron.steps.byob.scripts.run --list-families`.
 - Confirm final generation outputs `benchmark_raw.parquet` and `benchmark.parquet`.
 - Confirm translated outputs preserve row count unless `remove_low_quality` is intentionally enabled.
 
diff --git a/src/nemotron/steps/byob/data/tiny_input/maths/tiny.txt b/src/nemotron/steps/byob/data/tiny_input/maths/tiny.txt
new file mode 100644
index 000000000..c46a8c1a6
--- /dev/null
+++ b/src/nemotron/steps/byob/data/tiny_input/maths/tiny.txt
@@ -0,0 +1 @@
+Algebra studies symbols and the rules for manipulating them. Linear equations can model simple relationships, such as converting between a starting value and a constant rate of change.
diff --git a/src/nemotron/steps/byob/config/default.yaml b/src/nemotron/steps/byob/mcq/config/default.yaml
similarity index 88%
rename from src/nemotron/steps/byob/config/default.yaml
rename to src/nemotron/steps/byob/mcq/config/default.yaml
index 98ccd8e0a..28a6a0604 100644
--- a/src/nemotron/steps/byob/config/default.yaml
+++ b/src/nemotron/steps/byob/mcq/config/default.yaml
@@ -1,6 +1,12 @@
+run:
+  env:
+    mounts:
+      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Curator.git@d10cd6ffe9f5ac4cbb176d7b3ada698f22633aea,/opt/Curator}
+
 expt_name: byob_mcq_default
 random_seed: 42
 ndd_batch_size: 32
+stage: all
 
 split: test
 subset: all
@@ -43,6 +49,9 @@ generation_model_config: &generator
         low: 0.9
         high: 1.0
     top_p: 1.0
+    extra_body:
+      chat_template_kwargs:
+        enable_thinking: false
 
 judge_model_config: *generator
 
diff --git a/src/nemotron/steps/byob/config/tiny.yaml b/src/nemotron/steps/byob/mcq/config/tiny.yaml
similarity index 66%
rename from src/nemotron/steps/byob/config/tiny.yaml
rename to src/nemotron/steps/byob/mcq/config/tiny.yaml
index a5054b602..0bdc904c5 100644
--- a/src/nemotron/steps/byob/config/tiny.yaml
+++ b/src/nemotron/steps/byob/mcq/config/tiny.yaml
@@ -1,10 +1,16 @@
+run:
+  env:
+    mounts:
+      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Curator.git@d10cd6ffe9f5ac4cbb176d7b3ada698f22633aea,/opt/Curator}
+
 expt_name: byob_mcq_tiny
 random_seed: 7
 ndd_batch_size: 2
+stage: all
 
 split: test
 subset: all
-input_dir: /tmp/byob/input
+input_dir: /nemo_run/code/src/nemotron/steps/byob/data/tiny_input
 output_dir: /tmp/byob/output
 hf_dataset: cais/mmlu
 language: en-US
@@ -25,25 +31,28 @@ chunking_config:
 prompt_config: null
 
 generation_model_config: &tiny_model
-  alias: local_test_model
-  model: local/test-model
-  provider: local
+  alias: gpt-oss-120b
+  model: openai/gpt-oss-120b
+  provider: nvidia
   inference_parameters:
     max_tokens: 1024
     max_parallel_requests: 1
     temperature: 0.0
     top_p: 1.0
+    extra_body:
+      chat_template_kwargs:
+        enable_thinking: false
 
 judge_model_config: *tiny_model
 
 semantic_deduplication_config:
-  enabled: true
+  enabled: false
   model_identifier: sentence-transformers/all-MiniLM-L6-v2
   n_clusters: 1
   eps: 0.07
   remove_duplicates: false
 
-do_distractor_expansion: false
+do_distractor_expansion: true
 distractor_expansion_model_config: *tiny_model
 
 do_coverage_check: false
@@ -54,7 +63,7 @@ coverage_check_config:
 distractor_validity_model_config: *tiny_model
 
 semantic_outlier_detection_config:
-  enabled: true
+  enabled: false
   model_identifier: sentence-transformers/all-MiniLM-L6-v2
   n_neighbours_min: 1
   remove_outliers: false
@@ -64,9 +73,11 @@ hallucination_threshold: 0.5
 remove_hallucinated: false
 remove_easy: false
 filtering_model_configs:
+  # Aliases must be unique within the filter stage's model_configs list; keep
+  # them distinct from the shared `gpt-oss-120b` alias the anchor provides.
   hallucination:
-    - alias: hal_1
+    - alias: hal_gpt-oss-120b
       <<: *tiny_model
   easiness:
-    - alias: eas_1
+    - alias: eas_gpt-oss-120b
       <<: *tiny_model
diff --git a/src/nemotron/steps/byob/config/translate.yaml b/src/nemotron/steps/byob/mcq/config/translate.yaml
similarity index 77%
rename from src/nemotron/steps/byob/config/translate.yaml
rename to src/nemotron/steps/byob/mcq/config/translate.yaml
index 2e4bf0763..e61549887 100644
--- a/src/nemotron/steps/byob/config/translate.yaml
+++ b/src/nemotron/steps/byob/mcq/config/translate.yaml
@@ -1,4 +1,10 @@
+run:
+  env:
+    mounts:
+      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Curator.git@d10cd6ffe9f5ac4cbb176d7b3ada698f22633aea,/opt/Curator}
+
 expt_name: byob_mcq_translation
+stage: translate
 dataset_path: /workspace/byob/outputs/byob_mcq_default/benchmark.parquet
 output_dir: /workspace/byob/outputs
 source_language: en-US
@@ -17,6 +23,9 @@ translation_model_config:
       max_parallel_requests: 8
       temperature: 0.0
       top_p: 0.95
+      extra_body:
+        chat_template_kwargs:
+          enable_thinking: false
   stage:
     segmentation_mode: coarse
     min_segment_chars: 0
diff --git a/src/nemotron/steps/byob/step.py b/src/nemotron/steps/byob/mcq/step.py
similarity index 84%
rename from src/nemotron/steps/byob/step.py
rename to src/nemotron/steps/byob/mcq/step.py
index f8128ef34..62500371d 100644
--- a/src/nemotron/steps/byob/step.py
+++ b/src/nemotron/steps/byob/mcq/step.py
@@ -2,7 +2,7 @@
 # /// script
 # [tool.runspec]
 # schema = "1"
-# name = "steps/byob"
+# name = "steps/byob/mcq"
 #
 # [tool.runspec.run]
 # launch = "python"
@@ -16,7 +16,7 @@
 # nodes = 1
 # gpus_per_node = 0
 # ///
-"""Run the BYOB benchmark step."""
+"""Run the BYOB MCQ benchmark step."""
 
 from nemotron.steps.byob.scripts.run import main
 
diff --git a/src/nemotron/steps/byob/step.toml b/src/nemotron/steps/byob/mcq/step.toml
similarity index 60%
rename from src/nemotron/steps/byob/step.toml
rename to src/nemotron/steps/byob/mcq/step.toml
index 307561dc0..f1238aeaf 100644
--- a/src/nemotron/steps/byob/step.toml
+++ b/src/nemotron/steps/byob/mcq/step.toml
@@ -1,6 +1,6 @@
 [step]
-id = "byob"
-name = "Bring Your Own Benchmark"
+id = "byob/mcq"
+name = "Bring Your Own Benchmark — MCQ"
 category = "byob"
 description = "Generate and translate BYOB MCQ benchmark parquet artifacts from domain documents with an extensible benchmark-family runtime."
 tags = ["benchmark", "byob", "mcq", "data-designer", "curator", "translation"]
@@ -33,30 +33,38 @@ choices = ["mcq"]
 name = "stage"
 description = "BYOB pipeline stage to run."
 default = "generate"
-choices = ["prepare", "generate", "translate"]
+choices = ["prepare", "generate", "translate", "all"]
 
 [[parameters]]
 name = "config"
 description = "Path to a BYOB YAML config."
 
+[[parameters]]
+name = "target_source_mapping"
+description = "Benchmark target subjects mapped to source document directories or parquet references. Required for MCQ generation."
+
+[[parameters]]
+name = "filtering_model_configs"
+description = "Model configs used by quality filtering and semantic dedup stages. Keep these explicit for reproducible benchmark generation."
+
 [[parameters]]
 name = "skip_until"
 description = "Optional resume stage enum name for cached generation or translation runs."
 
 [[strategies]]
 when = "You need a domain-specific multiple-choice benchmark from user-provided documents"
-then = "Use family=mcq, run prepare before generate, and start from config/default.yaml or config/tiny.yaml for a smoke run. Curator semantic dedup uses RayDataExecutor, RayActorPoolExecutor, and package-level SemanticDeduplicationWorkflow."
-skill = "skills/nemotron-customize/context/byob-benchmark-curator-translation.txt"
+then = "Use family=mcq with stage=all, or run prepare before generate, and start from config/default.yaml or config/tiny.yaml for a smoke run. Curator semantic dedup uses RayDataExecutor, RayActorPoolExecutor, and package-level SemanticDeduplicationWorkflow."
+skill = "skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt"
 
 [[strategies]]
 when = "You need to extend BYOB beyond MCQ, for example GSM8K-style math problems"
 then = "Answer references/new-family-checklist.md first, then add a package under runtime/benchmark_families/ and register it without changing the dispatcher."
-skill = "skills/nemotron-customize/context/byob-benchmark-curator-translation.txt"
+skill = "skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt"
 
 [[strategies]]
 when = "You need to translate an existing BYOB MCQ benchmark"
 then = "Use stage=translate with config/translate.yaml. Curator experimental translation performs translation; BYOB preserves question_id, options, answer_index, answer, cot_content, src, and category."
-skill = "skills/nemotron-customize/context/byob-benchmark-curator-translation.txt"
+skill = "skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt"
 
 [[errors]]
 name = "missing_byob_source_documents"
@@ -74,13 +82,22 @@ recovery = "Keep final MCQ benchmark.parquet columns aligned with references/ben
 name = "new_family_keeps_mcq_assumptions"
 recovery = "For non-MCQ families, define a family-specific schema and replace MCQ-only stages such as distractor expansion and answer-letter validation."
 
+[[errors]]
+name = "resume_cache_missing"
+recovery = "Use skip_until only when the previous stage cache parquet exists under output_dir/expt_name/stage_cache."
+
+[[errors]]
+name = "translation_row_count_drift"
+recovery = "Translated outputs should preserve row count unless remove_low_quality is intentionally enabled; inspect quality cache before filtering."
+
 [reference]
 skill = "src/nemotron/steps/byob/SKILL.md"
-script = "src/nemotron/steps/byob/step.py"
-default_config = "src/nemotron/steps/byob/config/default.yaml"
-tiny_config = "src/nemotron/steps/byob/config/tiny.yaml"
-translate_config = "src/nemotron/steps/byob/config/translate.yaml"
+contract = "src/nemotron/steps/byob/references/STEP.md"
+script = "src/nemotron/steps/byob/mcq/step.py"
+default_config = "src/nemotron/steps/byob/mcq/config/default.yaml"
+tiny_config = "src/nemotron/steps/byob/mcq/config/tiny.yaml"
+translate_config = "src/nemotron/steps/byob/mcq/config/translate.yaml"
 runtime = "src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py"
 skills = [
-  "skills/nemotron-customize/context/byob-benchmark-curator-translation.txt",
+  "skills/nemotron-customize/references/context/byob-benchmark-curator-translation.txt",
 ]
diff --git a/src/nemotron/steps/byob/references/STEP.md b/src/nemotron/steps/byob/references/STEP.md
index 0f813bd47..abcfbbefc 100644
--- a/src/nemotron/steps/byob/references/STEP.md
+++ b/src/nemotron/steps/byob/references/STEP.md
@@ -1,11 +1,11 @@
 ---
-id: nemotron.steps.byob
-version: 0.1
+id: nemotron.steps.byob.mcq
+version: 0.2
 owner: nemotron
 summary: Generate and translate bring-your-own MCQ benchmarks from domain documents.
 entrypoint:
   kind: cli
-  command: nemotron byob
+  command: nemotron steps run byob/mcq
   module: nemotron.steps.byob.scripts.run
 consumes:
   - type: benchmark_source_corpus
diff --git a/src/nemotron/steps/byob/references/guide.md b/src/nemotron/steps/byob/references/guide.md
index 3353e3cb5..d68e3b1a0 100644
--- a/src/nemotron/steps/byob/references/guide.md
+++ b/src/nemotron/steps/byob/references/guide.md
@@ -22,13 +22,34 @@ lives in `runtime/benchmark_families/mcq/pipeline.py`. The generation run:
 8. Applies hallucination and easiness filters.
 9. Exports the final MCQ benchmark parquet.
 
-Run generation through the installed CLI:
+Run generation through the generic step dispatcher:
 
 ```bash
-nemotron byob --family mcq --stage prepare --config src/nemotron/steps/byob/config/default.yaml
-nemotron byob --family mcq --stage generate --config src/nemotron/steps/byob/config/default.yaml
+nemotron steps run byob/mcq -c default stage=prepare family=mcq
+nemotron steps run byob/mcq -c default stage=generate family=mcq
 ```
 
+You can also pass a YAML config path explicitly to `-c`:
+
+```bash
+nemotron steps run byob/mcq \
+  -c src/nemotron/steps/byob/mcq/config/default.yaml \
+  stage=prepare family=mcq
+```
+
+The BYOB configs declare Curator through the normal step config mount path:
+
+```yaml
+run:
+  env:
+    mounts:
+      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Curator.git@d10cd6ffe9f5ac4cbb176d7b3ada698f22633aea,/opt/Curator}
+```
+
+Remote profiles should expose `/opt/Curator` through `PYTHONPATH` (or an image-level
+install of Curator) and must also install the BYOB runtime dependencies in the profile.
+The step entrypoint intentionally stays a thin CLI wrapper.
+
 When editing semantic deduplication, keep the Curator imports aligned with the runtime:
 `nemo_curator.backends.ray_data.RayDataExecutor`,
 `nemo_curator.backends.ray_actor_pool.RayActorPoolExecutor`, and
@@ -41,7 +62,7 @@ questions and choices into text rows, Curator experimental translation translate
 the MCQ schema, Curator computes configured backtranslation quality metrics, and BYOB exports the final
 translated benchmark.
 
-Run translation through the same CLI with `--stage translate` and a translation config. Keep
+Run translation through the same dispatcher with `stage=translate` and a translation config. Keep
 Curator settings under `translation_model_config`; BYOB does not maintain a separate translation engine or mode selector.
 
 ## Extending To Another Family
diff --git a/src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py b/src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py
index e9f3b107f..43d96e459 100644
--- a/src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py
+++ b/src/nemotron/steps/byob/runtime/benchmark_families/mcq/pipeline.py
@@ -63,8 +63,6 @@ def generate_mcq(config_path: str | os.PathLike[str], *, skip_until: str | None
     """Run the MCQ benchmark generation pipeline and return the final benchmark path."""
     import pandas as pd
 
-    from nemotron.steps.byob.runtime.benchmark_families.mcq.deduplication import TextSemanticDeduplicationMCQ
-    from nemotron.steps.byob.runtime.benchmark_families.mcq.semantic_outlier import TextSemanticOutlierDetectionMCQ
     from nemotron.steps.byob.runtime.benchmark_families.mcq.stages import (
         check_distractor_validity,
         expand_distractors,
@@ -72,7 +70,6 @@ def generate_mcq(config_path: str | os.PathLike[str], *, skip_until: str | None
         generate_questions,
         judge_questions,
     )
-    from nemotron.steps.byob.runtime.benchmark_families.mcq.text_coverage import TextCoverageMCQ
     from nemotron.steps.byob.runtime.benchmark_families.mcq.utils import (
         postprocess_distractor_expansion,
         postprocess_distractor_validity,
@@ -125,6 +122,8 @@ def generate_mcq(config_path: str | os.PathLike[str], *, skip_until: str | None
     if _should_run(skip_until, McqGenerationStage.SEMANTIC_DEDUPLICATION):
         dataset_in = pd.read_parquet(last_output_path)
         if _is_enabled(config.semantic_deduplication_config):
+            from nemotron.steps.byob.runtime.benchmark_families.mcq.deduplication import TextSemanticDeduplicationMCQ
+
             dataset_out = TextSemanticDeduplicationMCQ(config).run(dataset_in)
         else:
             dataset_out = dataset_in.copy()
@@ -144,6 +143,8 @@ def generate_mcq(config_path: str | os.PathLike[str], *, skip_until: str | None
 
     if config.do_coverage_check:
         if _should_run(skip_until, McqGenerationStage.COVERAGE_CHECK):
+            from nemotron.steps.byob.runtime.benchmark_families.mcq.text_coverage import TextCoverageMCQ
+
             dataset_in = pd.read_parquet(last_output_path)
             dataset_out = TextCoverageMCQ(config).analyze(dataset_in)
             dataset_out.to_parquet(paths["coverage"])
@@ -161,6 +162,8 @@ def generate_mcq(config_path: str | os.PathLike[str], *, skip_until: str | None
     if _should_run(skip_until, McqGenerationStage.SEMANTIC_OUTLIER_DETECTION):
         dataset_in = pd.read_parquet(last_output_path)
         if _is_enabled(config.semantic_outlier_detection_config):
+            from nemotron.steps.byob.runtime.benchmark_families.mcq.semantic_outlier import TextSemanticOutlierDetectionMCQ
+
             dataset_out = TextSemanticOutlierDetectionMCQ(config).detect(dataset_in)
         else:
             dataset_out = dataset_in.copy()
diff --git a/src/nemotron/steps/byob/scripts/run.py b/src/nemotron/steps/byob/scripts/run.py
index 44113deb4..6ad1aaf5e 100644
--- a/src/nemotron/steps/byob/scripts/run.py
+++ b/src/nemotron/steps/byob/scripts/run.py
@@ -5,17 +5,23 @@
 import argparse
 from pathlib import Path
 
-from nemotron.steps.byob.scripts.runtime import list_family_names, run_byob
+from nemotron.steps.byob.scripts.runtime import (
+    STAGE_CHOICES,
+    list_family_names,
+    load_dispatch_config,
+    resolve_dispatch_value,
+    run_byob,
+)
 
 
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Run a BYOB benchmark family stage")
     parser.add_argument("--config", type=Path, help="Path to the BYOB YAML config")
-    parser.add_argument("--family", default="mcq", help="Benchmark family to run")
+    parser.add_argument("--family", default=None, help="Benchmark family to run")
     parser.add_argument(
         "--stage",
-        choices=("prepare", "generate", "translate"),
-        help="Pipeline stage to run",
+        choices=STAGE_CHOICES,
+        help="Pipeline stage to run. Use `all` to chain prepare and generate.",
     )
     parser.add_argument(
         "--skip-until",
@@ -35,15 +41,18 @@ def main() -> None:
             print(family)
         return
 
-    if args.config is None or args.stage is None:
-        parser.error("--config and --stage are required unless --list-families is set")
+    if args.config is None:
+        parser.error("--config is required unless --list-families is set")
 
-    output_path = run_byob(
-        config=args.config,
-        stage=args.stage,
-        family=args.family,
-        skip_until=args.skip_until,
-    )
+    yaml_dict = load_dispatch_config(args.config)
+    stage = resolve_dispatch_value(args.stage, yaml_dict, "stage")
+    family = resolve_dispatch_value(args.family, yaml_dict, "family", default="mcq")
+    skip_until = resolve_dispatch_value(args.skip_until, yaml_dict, "skip_until")
+
+    if stage is None:
+        parser.error("--stage is required unless the config contains `stage`")
+
+    output_path = run_byob(config=args.config, stage=stage, family=family, skip_until=skip_until)
     if output_path is not None:
         print(output_path)
 
diff --git a/src/nemotron/steps/byob/scripts/runtime.py b/src/nemotron/steps/byob/scripts/runtime.py
index cb7bae331..c3953e770 100644
--- a/src/nemotron/steps/byob/scripts/runtime.py
+++ b/src/nemotron/steps/byob/scripts/runtime.py
@@ -9,9 +9,12 @@
 from pathlib import Path
 from typing import Literal
 
+import yaml
+
 from nemotron.steps.byob.runtime.benchmark_families.registry import get_family, list_families
 
-StageName = Literal["prepare", "generate", "translate"]
+STAGE_CHOICES = ("prepare", "generate", "translate", "all")
+StageName = Literal["prepare", "generate", "translate", "all"]
 
 
 def list_family_names() -> tuple[str, ...]:
@@ -19,6 +22,18 @@ def list_family_names() -> tuple[str, ...]:
     return tuple(list_families())
 
 
+def load_dispatch_config(config_path: str | Path) -> dict:
+    """Parse the BYOB YAML config; returns ``{}`` for empty/non-mapping payloads."""
+    with Path(config_path).open("r", encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    return data if isinstance(data, dict) else {}
+
+
+def resolve_dispatch_value(arg_value, yaml_dict: dict, yaml_key: str, default=None):
+    """Resolve CLI/YAML dispatch values without coupling to one CLI framework."""
+    return arg_value or yaml_dict.get(yaml_key, default)
+
+
 def run_byob(
     *,
     config: str | Path,
@@ -30,6 +45,9 @@ def run_byob(
     spec = get_family(family)
     config_path = Path(config)
 
+    if stage == "all":
+        spec.prepare_data(config_path)
+        return spec.generate(config_path, skip_until=skip_until)
     if stage == "prepare":
         return spec.prepare_data(config_path)
     if stage == "generate":
diff --git a/src/nemotron/steps/convert/SKILL.md b/src/nemotron/steps/convert/SKILL.md
index 8b13ae3b3..1ed8bfa68 100644
--- a/src/nemotron/steps/convert/SKILL.md
+++ b/src/nemotron/steps/convert/SKILL.md
@@ -12,9 +12,9 @@ on `checkpoint_*` type. The artifact graph in
 
 | Source type | Target type | Step |
 |---|---|---|
-| `checkpoint_megatron` | `checkpoint_hf` | [megatron_to_hf](megatron_to_hf/step.toml) |
-| `checkpoint_hf` | `checkpoint_megatron` | [hf_to_megatron](hf_to_megatron/step.toml) |
-| `checkpoint_lora` (+ base `checkpoint_hf`) | `checkpoint_hf` (merged) | [merge_lora](merge_lora/step.toml) |
+| `checkpoint_megatron` | `checkpoint_hf` | [megatron_to_hf](megatron_to_hf/SKILL.md) |
+| `checkpoint_hf` | `checkpoint_megatron` | [hf_to_megatron](hf_to_megatron/SKILL.md) |
+| `checkpoint_lora` (+ original base) | `checkpoint_hf` (merged) | [merge_lora](merge_lora/SKILL.md) |
 
 ## When to insert
 
@@ -22,8 +22,10 @@ on `checkpoint_*` type. The artifact graph in
   consumers that expect HF format need `megatron_to_hf` first.
 - AutoModel SFT/PEFT produces `checkpoint_hf`. Megatron-Bridge consumers need
   `hf_to_megatron` first.
-- Any LoRA producer (`peft/*`) emits `checkpoint_lora`. Eval and RL almost
-  always want a merged HF model, not the adapter alone — chain `merge_lora`.
+- Any LoRA producer (`peft/*`) emits `checkpoint_lora`. HF/PEFT adapters can
+  merge directly with `merge_lora backend=hf_peft`; Megatron-Bridge adapters
+  use `backend=megatron_bridge`. `backend=auto` selects `megatron_bridge` when
+  `base_megatron_path` is set, otherwise `hf_peft`, and can export the merge to HF.
 
 ## Patterns to cite
 
@@ -36,7 +38,10 @@ on `checkpoint_*` type. The artifact graph in
 
 - Don't add a converter "just in case." Pick one input artifact type per
   consumer and configure to match.
+- Read the selected converter's `step.toml`; it lists the required paths,
+  merge provenance, and conversion failure modes.
 - When converting Megatron → HF, point at the specific `iter_*` directory,
   not the parent run dir.
 - When merging LoRA, you need the *original* base checkpoint the adapter was
-  trained against. Don't merge into a different base.
+  trained against. For Megatron-Bridge adapters, preserve the dense Megatron
+  base and an HF model/config source for export.
diff --git a/src/nemotron/steps/convert/hf_to_megatron/SKILL.md b/src/nemotron/steps/convert/hf_to_megatron/SKILL.md
new file mode 100644
index 000000000..3661e2993
--- /dev/null
+++ b/src/nemotron/steps/convert/hf_to_megatron/SKILL.md
@@ -0,0 +1,36 @@
+---
+name: nemotron-convert-hf-to-megatron
+description: Configure convert/hf_to_megatron to import a Hugging Face safetensors checkpoint into Megatron distributed checkpoint layout for Megatron-Bridge consumers.
+---
+
+# HF To Megatron Conversion
+
+Use `convert/hf_to_megatron` when a downstream Megatron-Bridge step needs
+`checkpoint_megatron` but the upstream artifact is `checkpoint_hf`.
+
+Before changing configs or code, read `step.toml` for the artifact contract,
+parameters, strategies, and failure modes.
+
+## Inputs And Outputs
+
+- Consume a clean HF checkpoint directory or model id.
+- Produce a Megatron distributed checkpoint in a fresh output directory.
+- Keep tokenizer and model config files resolvable during import.
+
+## Configure
+
+- Set `hf_model_id` to the HF model id or local checkpoint path.
+- Set `megatron_path` to a new output directory.
+- Keep `torch_dtype=bfloat16` for typical Nemotron/NVIDIA checkpoints unless a source
+  model requires another dtype.
+- Set `device_map` only when the installed Megatron-Bridge/Transformers stack
+  expects one for local loading.
+- Merge LoRA adapters before importing them into Megatron layout.
+
+## Guardrails
+
+- Do not import trainer-state directories, optimizer folders, or adapter-only
+  outputs.
+- Do not write the Megatron output under the HF source directory.
+- Keep `trust_remote_code=true` only for model repos you trust and whose
+  architecture is supported by the installed Megatron-Bridge AutoBridge.
diff --git a/src/nemotron/steps/convert/hf_to_megatron/config/default.yaml b/src/nemotron/steps/convert/hf_to_megatron/config/default.yaml
new file mode 100644
index 000000000..8507f987f
--- /dev/null
+++ b/src/nemotron/steps/convert/hf_to_megatron/config/default.yaml
@@ -0,0 +1,27 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Convert a HuggingFace checkpoint or model id into Megatron distributed layout.
+#
+# Usage:
+#   nemotron steps run convert/hf_to_megatron -c default \
+#     hf_model_id=/path/to/hf_checkpoint \
+#     megatron_path=/path/to/output_megatron_checkpoint
+
+hf_model_id: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+megatron_path: ${oc.env:CONVERT_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/convert}/hf_to_megatron/megatron
+
+torch_dtype: bfloat16
+device_map: null
+trust_remote_code: true
diff --git a/src/nemotron/steps/convert/hf_to_megatron/step.py b/src/nemotron/steps/convert/hf_to_megatron/step.py
index 62f5a1446..3b262d16d 100644
--- a/src/nemotron/steps/convert/hf_to_megatron/step.py
+++ b/src/nemotron/steps/convert/hf_to_megatron/step.py
@@ -3,19 +3,48 @@
 # [tool.runspec]
 # schema = "1"
 # name = "steps/convert/hf_to_megatron"
+# image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
 #
 # [tool.runspec.run]
 # launch = "python"
+#
+# [tool.runspec.config]
+# dir = "./config"
+# default = "default"
+# format = "omegaconf"
+#
+# [tool.runspec.resources]
+# nodes = 1
+# gpus_per_node = 8
 # ///
-"""HF -> Megatron pattern. Real script: Megatron-Bridge/examples/conversion/convert_checkpoints.py"""
-import torch
-from megatron.bridge import AutoBridge
+
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HF -> Megatron conversion using Megatron-Bridge AutoBridge."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemotron.steps._runners.convert import run_hf_to_megatron
+
+DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 
 
 def main() -> None:
-    hf_model = "nvidia/Nemotron-3-Nano-30B-A3B"
-    megatron_path = "/path/to/megatron_ckpt"
-    AutoBridge.import_ckpt(hf_model_id=hf_model, megatron_path=megatron_path, torch_dtype=torch.bfloat16)
+    run_hf_to_megatron(DEFAULT_CONFIG)
 
 
 if __name__ == "__main__":
diff --git a/src/nemotron/steps/convert/hf_to_megatron/step.toml b/src/nemotron/steps/convert/hf_to_megatron/step.toml
index 19fa88587..da07d5826 100644
--- a/src/nemotron/steps/convert/hf_to_megatron/step.toml
+++ b/src/nemotron/steps/convert/hf_to_megatron/step.toml
@@ -27,5 +27,59 @@ description = "HuggingFace safetensors checkpoint"
 type = "checkpoint_megatron"
 description = "Megatron distributed checkpoint"
 
+[[parameters]]
+name = "hf_model_id"
+description = "HF model id or local checkpoint path to import. Use a clean model directory, not a training-state folder."
+
+[[parameters]]
+name = "megatron_path"
+description = "Output directory for the Megatron distributed checkpoint. Keep it separate from the HF source path."
+
+[[parameters]]
+name = "torch_dtype"
+description = "Torch dtype used during import, normally bfloat16 for Nemotron/NVIDIA checkpoints."
+default = "bfloat16"
+choices = ["bfloat16", "float16", "float32"]
+
+[[parameters]]
+name = "dtype"
+description = "Deprecated alias for torch_dtype."
+
+[[parameters]]
+name = "device_map"
+description = "Optional Transformers device_map forwarded during HF model loading, such as auto or cuda:0."
+default = ""
+
+[[parameters]]
+name = "trust_remote_code"
+description = "Whether to trust HF custom model code when AutoBridge loads the source model config."
+default = true
+
+[[strategies]]
+when = "A Megatron-Bridge consumer needs an HF-produced checkpoint"
+then = "Insert this converter explicitly instead of changing the downstream trainer to accept HF layout."
+
+[[strategies]]
+when = "The source checkpoint came from LoRA training"
+then = "Merge the adapter into its original HF base first with convert/merge_lora, then import the merged HF checkpoint."
+
+[[strategies]]
+when = "Tokenizer or model config is missing from the HF directory"
+then = "Use the original HF model id/path as hf_model_id or copy the tokenizer/config files before conversion."
+
+[[errors]]
+name = "source_not_clean_hf_checkpoint"
+recovery = "Point hf_model_id at a model directory with config/tokenizer/safetensors, not at trainer logs, optimizer state, or adapter-only output."
+
+[[errors]]
+name = "output_path_overlaps_source"
+recovery = "Set megatron_path to a new directory so a failed import cannot corrupt the HF source checkpoint."
+
+[[errors]]
+name = "unsupported_or_custom_architecture"
+recovery = "Keep trust_remote_code=true for supported custom HF repos, and verify the installed Megatron-Bridge AutoBridge supports the architecture."
+
 [reference]
+skill = "src/nemotron/steps/convert/hf_to_megatron/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py"
+skills = ["skills/nemotron-customize/references/context/checkpoint-conversion.txt"]
diff --git a/src/nemotron/steps/convert/megatron_to_hf/SKILL.md b/src/nemotron/steps/convert/megatron_to_hf/SKILL.md
new file mode 100644
index 000000000..cc6a68dc3
--- /dev/null
+++ b/src/nemotron/steps/convert/megatron_to_hf/SKILL.md
@@ -0,0 +1,36 @@
+---
+name: nemotron-convert-megatron-to-hf
+description: Configure convert/megatron_to_hf to export a Megatron distributed checkpoint iteration into Hugging Face safetensors layout for evaluation, deployment, optimization, or adapter merge.
+---
+
+# Megatron To HF Conversion
+
+Use `convert/megatron_to_hf` when a downstream HF-native step needs
+`checkpoint_hf` but the upstream artifact is `checkpoint_megatron`.
+
+Before changing configs or code, read `step.toml` for the artifact contract,
+parameters, strategies, and failure modes.
+
+## Inputs And Outputs
+
+- Consume a specific Megatron checkpoint iteration, normally an `iter_*`
+  directory.
+- Produce a standalone HF safetensors checkpoint.
+- Preserve tokenizer and config expectations from the original HF model id.
+
+## Configure
+
+- Set `megatron_path` to the concrete checkpoint iteration, not the parent run
+  directory.
+- Set `hf_model_id` to the original model/config source when the checkpoint
+  lacks enough HF metadata.
+- Set `hf_path` to a fresh export directory.
+- Keep `strict=true` unless you intentionally accept source/target checkpoint
+  key mismatches for a known architecture drift.
+
+## Guardrails
+
+- Do not export while async checkpoint save is still in progress.
+- Do not guess among multiple checkpoint iterations; pick the validated one.
+- Validate that the exported HF checkpoint loads before using it for eval or
+  deployment.
diff --git a/src/nemotron/steps/convert/megatron_to_hf/config/default.yaml b/src/nemotron/steps/convert/megatron_to_hf/config/default.yaml
new file mode 100644
index 000000000..3096e1f60
--- /dev/null
+++ b/src/nemotron/steps/convert/megatron_to_hf/config/default.yaml
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Convert a concrete Megatron checkpoint directory, normally an iter_* folder,
+# into HuggingFace safetensors layout.
+#
+# Usage:
+#   nemotron steps run convert/megatron_to_hf -c default \
+#     megatron_path=/path/to/megatron/iter_0000100 \
+#     hf_model_id=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+#     hf_path=/path/to/output_hf_checkpoint
+
+megatron_path: null
+hf_model_id: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+hf_path: ${oc.env:CONVERT_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/convert}/megatron_to_hf/hf
+
+trust_remote_code: true
+show_progress: true
+strict: true
diff --git a/src/nemotron/steps/convert/megatron_to_hf/step.py b/src/nemotron/steps/convert/megatron_to_hf/step.py
index 64a9a9324..62a66114e 100644
--- a/src/nemotron/steps/convert/megatron_to_hf/step.py
+++ b/src/nemotron/steps/convert/megatron_to_hf/step.py
@@ -3,20 +3,48 @@
 # [tool.runspec]
 # schema = "1"
 # name = "steps/convert/megatron_to_hf"
+# image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
 #
 # [tool.runspec.run]
 # launch = "python"
+#
+# [tool.runspec.config]
+# dir = "./config"
+# default = "default"
+# format = "omegaconf"
+#
+# [tool.runspec.resources]
+# nodes = 1
+# gpus_per_node = 8
 # ///
-"""Megatron -> HF pattern. Real script: Megatron-Bridge/examples/conversion/convert_checkpoints.py"""
-from megatron.bridge import AutoBridge
+
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Megatron -> HF conversion using Megatron-Bridge AutoBridge."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemotron.steps._runners.convert import run_megatron_to_hf
+
+DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 
 
 def main() -> None:
-    megatron_path = "/path/to/megatron_ckpt"
-    hf_model = "nvidia/Nemotron-3-Nano-30B-A3B"
-    hf_path = "/path/to/hf_safetensors"
-    bridge = AutoBridge.from_auto_config(megatron_path, hf_model, trust_remote_code=True)
-    bridge.export_ckpt(megatron_path=megatron_path, hf_path=hf_path)
+    run_megatron_to_hf(DEFAULT_CONFIG)
 
 
 if __name__ == "__main__":
diff --git a/src/nemotron/steps/convert/megatron_to_hf/step.toml b/src/nemotron/steps/convert/megatron_to_hf/step.toml
index 57ffda969..edfbdf783 100644
--- a/src/nemotron/steps/convert/megatron_to_hf/step.toml
+++ b/src/nemotron/steps/convert/megatron_to_hf/step.toml
@@ -27,5 +27,58 @@ description = "Megatron distributed checkpoint"
 type = "checkpoint_hf"
 description = "HuggingFace safetensors checkpoint"
 
+[[parameters]]
+name = "megatron_path"
+description = "Specific Megatron checkpoint directory to export, normally the iter_* directory rather than the parent run folder."
+
+[[parameters]]
+name = "hf_model_id"
+description = "HF model id or config source used by AutoBridge to reconstruct the HF architecture and tokenizer expectations."
+
+[[parameters]]
+name = "hf_path"
+description = "Output directory for the exported HF safetensors checkpoint."
+
+[[parameters]]
+name = "trust_remote_code"
+description = "Whether to trust custom HF model code while reconstructing the export target."
+default = true
+
+[[parameters]]
+name = "show_progress"
+description = "Whether to show Megatron-Bridge export progress output."
+default = true
+
+[[parameters]]
+name = "strict"
+description = "Whether Megatron-Bridge should require source and target checkpoint keys to match strictly."
+default = true
+
+[[strategies]]
+when = "The next consumer is HF-native evaluation, deployment, pruning, quantization input, or adapter merge"
+then = "Export the specific Megatron iter_* checkpoint to HF before running the downstream step."
+
+[[strategies]]
+when = "A training run produced multiple checkpoint iterations"
+then = "Pick the validated iter_* directory explicitly; do not point at the parent run directory."
+
+[[strategies]]
+when = "The export will be used for log-probability evaluation"
+then = "Ensure tokenizer files are available in the HF output or are resolvable from hf_model_id."
+
+[[errors]]
+name = "bad_megatron_checkpoint_path"
+recovery = "Point megatron_path at the concrete iter_* checkpoint directory with distributed model shards."
+
+[[errors]]
+name = "missing_hf_config_or_tokenizer"
+recovery = "Set hf_model_id to the original HF model/config source so AutoBridge can reconstruct the export."
+
+[[errors]]
+name = "incomplete_or_async_checkpoint"
+recovery = "Use a fully written checkpoint iteration. Avoid exporting while async checkpoint save is still in progress."
+
 [reference]
+skill = "src/nemotron/steps/convert/megatron_to_hf/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/conversion/convert_checkpoints.py"
+skills = ["skills/nemotron-customize/references/context/checkpoint-conversion.txt"]
diff --git a/src/nemotron/steps/convert/merge_lora/SKILL.md b/src/nemotron/steps/convert/merge_lora/SKILL.md
new file mode 100644
index 000000000..b2f2df104
--- /dev/null
+++ b/src/nemotron/steps/convert/merge_lora/SKILL.md
@@ -0,0 +1,40 @@
+---
+name: nemotron-convert-merge-lora
+description: Configure convert/merge_lora to merge a LoRA adapter into its original base checkpoint and produce a standalone HF checkpoint.
+---
+
+# Merge LoRA
+
+Use `convert/merge_lora` when a downstream consumer needs a standalone
+`checkpoint_hf` instead of a separate adapter artifact.
+
+Before changing configs or code, read `step.toml` for the artifact contract,
+parameters, strategies, and failure modes.
+
+## Inputs And Outputs
+
+- Consume `checkpoint_lora` plus the original base checkpoint.
+- With `backend=hf_peft`, consume the original HF base and write HF output
+  directly.
+- With `backend=megatron_bridge`, consume the original dense Megatron base,
+  write a merged Megatron checkpoint, then export it to HF when `export_hf=true`.
+
+## Configure
+
+- Keep `backend=auto` unless you want to force a merge path.
+- Set `backend=hf_peft` for AutoModel/HuggingFace PEFT adapters.
+- Set `backend=megatron_bridge` for Megatron-Bridge adapters.
+- Set `lora_checkpoint` to the adapter output from the PEFT run.
+- For HF PEFT, set `base_hf_path` to the exact base model used during adapter
+  training and `output_hf_path` to a fresh directory.
+- For Megatron-Bridge, set `base_megatron_path`, `hf_model_id` or
+  `hf_model_path`, `output_megatron_path`, and `output_hf_path`.
+- Use CPU merge (`cpu=true`) for memory-constrained or non-training
+  environments when `tp`, `pp`, and `ep` are all 1.
+
+## Guardrails
+
+- Never merge into a different base, even if the model name looks compatible.
+- Evaluate after merge; adapter-loaded and merged-model scores can differ.
+- Keep tokenizer, chat template, LoRA rank, alpha, and target module provenance
+  with the merged artifact.
diff --git a/src/nemotron/steps/convert/merge_lora/config/default.yaml b/src/nemotron/steps/convert/merge_lora/config/default.yaml
new file mode 100644
index 000000000..113751811
--- /dev/null
+++ b/src/nemotron/steps/convert/merge_lora/config/default.yaml
@@ -0,0 +1,64 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Merge a LoRA adapter into its original base checkpoint.
+#
+# backend=hf_peft merges HuggingFace PEFT adapters directly to HF output.
+# backend=megatron_bridge runs Megatron-Bridge's merge_lora.py, then exports the
+# merged Megatron checkpoint to HF when export_hf=true.
+#
+# HF PEFT usage:
+#   nemotron steps run convert/merge_lora -c default \
+#     lora_checkpoint=/path/to/adapter-or-checkpoint-root \
+#     base_hf_path=/path/to/base_hf \
+#     output_hf_path=/path/to/merged_hf
+#
+# Megatron-Bridge usage:
+#   nemotron steps run convert/merge_lora -c default backend=megatron_bridge \
+#     lora_checkpoint=/path/to/lora_megatron \
+#     base_megatron_path=/path/to/base_megatron \
+#     hf_model_id=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16 \
+#     output_megatron_path=/path/to/merged_megatron \
+#     output_hf_path=/path/to/merged_hf
+
+backend: auto
+
+lora_checkpoint: null
+base_hf_path: null
+output_hf_path: ${oc.env:CONVERT_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/convert}/merge_lora/merged-hf
+
+trust_remote_code: true
+torch_dtype: auto
+device_map: null
+low_cpu_mem_usage: true
+safe_serialization: true
+save_tokenizer: true
+
+# Megatron-Bridge backend options.
+upstream_script: /opt/Megatron-Bridge/examples/peft/merge_lora.py
+hf_model_id: null
+hf_model_path: null
+base_megatron_path: null
+output_megatron_path: ${oc.env:CONVERT_OUTPUT_DIR,${oc.env:NEMO_RUN_DIR,${oc.env:PWD}/output}/convert}/merge_lora/merged-megatron
+export_hf: true
+
+cpu: true
+tp: 1
+pp: 1
+ep: 1
+nproc_per_node: 1
+debug: false
+
+show_progress: true
+strict: true
diff --git a/src/nemotron/steps/convert/merge_lora/step.py b/src/nemotron/steps/convert/merge_lora/step.py
index ea9916758..f34647a54 100644
--- a/src/nemotron/steps/convert/merge_lora/step.py
+++ b/src/nemotron/steps/convert/merge_lora/step.py
@@ -3,21 +3,48 @@
 # [tool.runspec]
 # schema = "1"
 # name = "steps/convert/merge_lora"
+# image = "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
 #
 # [tool.runspec.run]
 # launch = "python"
+#
+# [tool.runspec.config]
+# dir = "./config"
+# default = "default"
+# format = "omegaconf"
+#
+# [tool.runspec.resources]
+# nodes = 1
+# gpus_per_node = 8
 # ///
-"""Merge LoRA into a base model. Real script: Megatron-Bridge/examples/peft/merge_lora.py"""
-from subprocess import run
+
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Merge LoRA adapters into standalone checkpoints."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from nemotron.steps._runners.convert import run_merge_lora
+
+DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 
 
 def main() -> None:
-    run(
-        ["python", "Megatron-Bridge/examples/peft/merge_lora.py", "--lora-checkpoint", "/path/to/lora_ckpt",
-         "--hf-model-path", "nvidia/Nemotron-3-Nano-30B-A3B", "--output", "/path/to/merged_checkpoint",
-         "--pretrained", "/path/to/base_ckpt", "--cpu"],
-        check=True,
-    )
+    run_merge_lora(DEFAULT_CONFIG)
 
 
 if __name__ == "__main__":
diff --git a/src/nemotron/steps/convert/merge_lora/step.toml b/src/nemotron/steps/convert/merge_lora/step.toml
index a87697ea5..3dabf9c8a 100644
--- a/src/nemotron/steps/convert/merge_lora/step.toml
+++ b/src/nemotron/steps/convert/merge_lora/step.toml
@@ -16,7 +16,7 @@
 id = "convert/merge_lora"
 name = "Merge LoRA Adapter"
 category = "convert"
-description = "Merge a LoRA adapter into the base model to produce a standalone HuggingFace checkpoint."
+description = "Merge a LoRA adapter into its original base model, producing a standalone HuggingFace checkpoint."
 tags = ["conversion", "lora", "peft", "merge"]
 
 [[consumes]]
@@ -25,11 +25,98 @@ description = "LoRA adapter weights"
 
 [[consumes]]
 type = "checkpoint_hf"
-description = "Base model in HuggingFace format"
+description = "Base model or HF config source for adapter merge"
+
+[[consumes]]
+type = "checkpoint_megatron"
+description = "Base Megatron checkpoint for Megatron-Bridge adapter merge"
+required = false
 
 [[produces]]
 type = "checkpoint_hf"
 description = "Merged model checkpoint"
 
+[[produces]]
+type = "checkpoint_megatron"
+description = "Merged Megatron checkpoint when backend=megatron_bridge"
+required = false
+
+[[parameters]]
+name = "backend"
+description = "Adapter merge backend. backend=auto selects megatron_bridge when base_megatron_path is set, otherwise hf_peft."
+default = "auto"
+choices = ["auto", "hf_peft", "megatron_bridge"]
+
+[[parameters]]
+name = "lora_checkpoint"
+description = "Adapter checkpoint path produced by a PEFT step."
+
+[[parameters]]
+name = "base_hf_path"
+description = "Original HF base checkpoint for backend=hf_peft, or HF config/model source for backend=megatron_bridge."
+
+[[parameters]]
+name = "base_megatron_path"
+description = "Original dense Megatron checkpoint for backend=megatron_bridge. Do not substitute a different base."
+
+[[parameters]]
+name = "hf_model_id"
+description = "HF model id or path used to reconstruct architecture when exporting a Megatron-Bridge merge to HF."
+
+[[parameters]]
+name = "output_hf_path"
+description = "Directory for the merged standalone HF checkpoint."
+
+[[parameters]]
+name = "output_megatron_path"
+description = "Directory for the merged Megatron checkpoint when backend=megatron_bridge."
+
+[[parameters]]
+name = "cpu"
+description = "Merge on CPU when GPU memory is tight or when running outside a training container."
+default = true
+
+[[parameters]]
+name = "tp"
+description = "Tensor parallel size for Megatron-Bridge merge."
+default = 1
+
+[[parameters]]
+name = "pp"
+description = "Pipeline parallel size for Megatron-Bridge merge."
+default = 1
+
+[[parameters]]
+name = "ep"
+description = "Expert parallel size for Megatron-Bridge merge."
+default = 1
+
+[[strategies]]
+when = "The adapter came from peft/automodel"
+then = "Use backend=hf_peft and merge directly into the same HF base model used for PEFT training."
+
+[[strategies]]
+when = "The adapter came from peft/megatron_bridge"
+then = "Use backend=megatron_bridge with the original dense Megatron base, then export the merged checkpoint to HF."
+
+[[strategies]]
+when = "Quality is being compared before and after merge"
+then = "Evaluate the adapter-loaded model and the merged checkpoint separately; do not assume scores are identical."
+
+[[errors]]
+name = "base_model_mismatch"
+recovery = "Merge only into the exact base checkpoint used during adapter training, including tokenizer and chat-template assumptions."
+
+[[errors]]
+name = "adapter_not_validated"
+recovery = "Run a small adapter-loaded generation or eval before producing the merged deployment artifact."
+
+[[errors]]
+name = "output_path_overlaps_inputs"
+recovery = "Write output_hf_path to a fresh directory so failed merges cannot overwrite the base or adapter."
+
 [reference]
+skill = "src/nemotron/steps/convert/merge_lora/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/examples/peft/merge_lora.py"
+hf_peft_library = "https://github.com/huggingface/peft"
+skills = ["skills/nemotron-customize/references/context/checkpoint-conversion.txt"]
diff --git a/src/nemotron/steps/curate/nemo_curator/SKILL.md b/src/nemotron/steps/curate/nemo_curator/SKILL.md
index 367afe638..be6f44c50 100644
--- a/src/nemotron/steps/curate/nemo_curator/SKILL.md
+++ b/src/nemotron/steps/curate/nemo_curator/SKILL.md
@@ -1,37 +1,61 @@
 ---
 name: nemotron-curate-nemo-curator
-description: Configure Nemotron curate/nemo_curator to acquire and filter web/public text data into filtered_jsonl with language and domain annotations. Use for Common Crawl, Wikipedia, custom URL lists, or local JSONL/Parquet inputs that need quality, language, or domain gating before training.
+description: Configure Nemotron curate/nemo_curator to read JSONL text, optionally hydrate a Hugging Face dataset snapshot, apply light NeMo Curator language, word-count, and domain filters, and write filtered_jsonl for translate or data_prep steps.
 ---
 
-# Data Acquisition & Curation (NeMo Curator)
+# Lightweight Text Curation (NeMo Curator)
 
-Use `curate/nemo_curator` to turn raw or remote text into `filtered_jsonl`
-with language and domain annotations downstream training can rely on.
+Use `curate/nemo_curator` to turn JSONL text into `filtered_jsonl` that can
+feed translation, pretraining prep, or SFT prep.
 
 Read `step.toml` for the full strategy/error matrix.
 
+## Current runner
+
+The step is intentionally small:
+
+`JsonlReader -> optional FastText language filter -> optional WordCountFilter -> optional MultilingualDomainClassifier -> JsonlWriter`
+
+It can call `huggingface_hub.snapshot_download` before reading if `dataset` is
+set in YAML. It does not implement Common Crawl extraction, URL crawling, or
+deduplication itself; use a dedicated Curator recipe for those before this step
+or add them as a separate step.
+
 ## Inputs and outputs
 
-- Consume: external sources (Common Crawl, Wikipedia, URL lists) or local
-  JSONL/Parquet. The step doesn't declare an explicit `[[consumes]]` because
-  acquisition is the first step in many pipelines.
-- Produce: `filtered_jsonl` (with language/domain annotations).
+- Consume: `raw_jsonl` files matched by `input_glob`. If `dataset` is set, the
+  Hugging Face snapshot is downloaded first and `input_glob` should point into
+  that local snapshot.
+- Produce: JSONL shards under `output_dir`. Language/domain fields appear only
+  when the corresponding filters are enabled.
 
 ## Configure
 
-- **Pick the entry stage based on data location:**
-  - Remote public sources → Curator download/extract composite stages.
-  - Local JSONL/Parquet → `JsonlReader` / `ParquetReader`, skip acquisition.
-- **Stack filters in this order**: heuristic ScoreFilter → QualityClassifier
-  → deduplication. Don't reverse the order — quality classifiers are
-  expensive and benefit from heuristic prefiltering.
-- **Multilingual gating** needs FastText `lid.176` (the `missing_language_model`
-  error recovery is "download lid.176 and wire its path"). Have the path ready
-  before enabling language filters.
-- **Domain gating**: FastText language ID first, then `DomainClassifier` /
-  `MultilingualDomainClassifier`.
+- Set `input_glob`, `output_dir`, and `text_field` first.
+- Set `dataset: null` for local files. Set `dataset.repo_id`,
+  `dataset.repo_type`, `dataset.local_dir`, and optional `allow_patterns` for a
+  Hugging Face snapshot.
+- Set `language_codes: []` to skip FastText language filtering. If non-empty,
+  provide `models.fasttext_langid`.
+- Set `quality_filters: {}` to skip word-count filters. If either `min_words`
+  or `max_words` is set, set both.
+- Set `domains: []` to skip domain classification. If non-empty, provide
+  `models.hf_cache_dir` when you need a persistent model cache.
+- On small CPU Lepton runs, use the Curator container as-is and set
+  `NEMOTRON_CURATOR_RAY_NUM_CPUS=4` through the env profile when the YAML does
+  not include `ray.num_cpus`.
 - Reference [src/nemotron/steps/patterns/data-quality-before-quantity.md](../../patterns/data-quality-before-quantity.md)
-  before tuning `num_records` upward.
+  before scaling corpus size or tightening filters.
+
+## Smoke commands
+
+```bash
+uv run nemotron steps run curate/nemo_curator -c tiny -r lepton_curate
+```
+
+```bash
+uv run lep log get -j curate-nemo-curator-step-xxxx --limit 300
+```
 
 ## Local files
 
@@ -41,9 +65,9 @@ Read `step.toml` for the full strategy/error matrix.
 
 ## Guardrails
 
-- Don't enable everything at once. Filter with heuristics first; classifiers
-  and dedupe come after the corpus is small enough to iterate on.
+- Don't enable every optional filter on the first run. Start with `tiny` or
+  local JSONL with no filters enabled, then add language, word-count, and domain gates.
 - Inspect intermediate JSONL when output is empty or tiny — usually a filter
   is set too aggressively.
-- Split very large input files before reading; OOMs come from oversized
-  partitions, not Curator itself.
+- Split very large input files before reading; OOMs usually come from oversized
+  partitions.
diff --git a/src/nemotron/steps/curate/nemo_curator/config/default.yaml b/src/nemotron/steps/curate/nemo_curator/config/default.yaml
index 1c1c7b8ed..d58da1919 100644
--- a/src/nemotron/steps/curate/nemo_curator/config/default.yaml
+++ b/src/nemotron/steps/curate/nemo_curator/config/default.yaml
@@ -1,6 +1,8 @@
-# Starter config for data acquisition + curation with NeMo Curator.
-# Downloads a dataset snapshot from Hugging Face, keeps selected languages,
-# applies lightweight quality filters, then annotates/filter by domain.
+# Starter config for lightweight JSONL curation with NeMo Curator.
+# If dataset is non-null, snapshot_download runs first and input_glob should
+# point into dataset.local_dir. Set dataset: null for local JSONL-only runs.
+# Optional filters are controlled by language_codes, domains, and
+# quality_filters. Empty values disable those stages.
 
 language_codes:
   - EN
diff --git a/src/nemotron/steps/curate/nemo_curator/config/tiny.yaml b/src/nemotron/steps/curate/nemo_curator/config/tiny.yaml
new file mode 100644
index 000000000..cf3a3534a
--- /dev/null
+++ b/src/nemotron/steps/curate/nemo_curator/config/tiny.yaml
@@ -0,0 +1,16 @@
+# Tiny NeMo Curator smoke test for Lepton using the Curator container as-is.
+# Optional filters are disabled so this verifies reader, writer, Ray startup,
+# and the packaged tiny JSONL path without downloading models.
+
+language_codes: []
+
+domains: []
+text_field: text
+input_glob: /nemo_run/code/src/nemotron/steps/curate/nemo_curator/data/tiny.jsonl
+output_dir: /mnt/lustre-shared/output/test/curate_nemo_curator_tiny
+
+dataset: null
+
+models: {}
+
+quality_filters: {}
diff --git a/src/nemotron/steps/curate/nemo_curator/data/tiny.jsonl b/src/nemotron/steps/curate/nemo_curator/data/tiny.jsonl
new file mode 100644
index 000000000..373830cc8
--- /dev/null
+++ b/src/nemotron/steps/curate/nemo_curator/data/tiny.jsonl
@@ -0,0 +1,2 @@
+{"text":"This is a small English document for the curation smoke test with enough words for optional quality filters."}
+{"text":"Another English sample about mathematics, banking, and clean data processing for a tiny curation validation run."}
diff --git a/src/nemotron/steps/curate/nemo_curator/step.py b/src/nemotron/steps/curate/nemo_curator/step.py
index 3dbc3b6a2..9e8307bd0 100644
--- a/src/nemotron/steps/curate/nemo_curator/step.py
+++ b/src/nemotron/steps/curate/nemo_curator/step.py
@@ -24,11 +24,12 @@
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 
-"""Data acquisition and curation via NeMo Curator — reference implementation."""
+"""Lightweight JSONL curation via NeMo Curator."""
 
 from __future__ import annotations
 
 import argparse
+import os
 from ast import literal_eval
 from pathlib import Path
 
@@ -36,10 +37,6 @@
 from huggingface_hub import snapshot_download
 from nemo_curator.core.client import RayClient
 from nemo_curator.pipeline import Pipeline
-from nemo_curator.stages.text.classifiers import MultilingualDomainClassifier
-from nemo_curator.stages.text.filters import Filter, ScoreFilter
-from nemo_curator.stages.text.filters.fasttext import FastTextLangId
-from nemo_curator.stages.text.filters.heuristic.string import WordCountFilter
 from nemo_curator.stages.text.io.reader import JsonlReader
 from nemo_curator.stages.text.io.writer import JsonlWriter
 
@@ -51,52 +48,86 @@ def keep_language(value: str, allowed: set[str]) -> bool:
     return lang_code in allowed and score >= 0.0
 
 
+def ray_client_kwargs(cfg: dict) -> dict:
+    kwargs = dict(cfg.get("ray") or {})
+    if "num_cpus" not in kwargs and os.environ.get("NEMOTRON_CURATOR_RAY_NUM_CPUS"):
+        kwargs["num_cpus"] = int(os.environ["NEMOTRON_CURATOR_RAY_NUM_CPUS"])
+    return kwargs
+
+
+def text_filter_stages():
+    """Return Filter/ScoreFilter across supported NeMo Curator releases."""
+    try:
+        from nemo_curator.stages.text.modules import Filter, ScoreFilter
+    except ImportError:
+        from nemo_curator.stages.text.filters import Filter, ScoreFilter
+    return Filter, ScoreFilter
+
+
 def main() -> None:
-    parser = argparse.ArgumentParser(description="Acquire and curate text with NeMo Curator")
+    parser = argparse.ArgumentParser(description="Curate JSONL text with NeMo Curator")
     parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG)
     args = parser.parse_args()
     cfg = yaml.safe_load(args.config.read_text())
 
-    snapshot_download(**cfg["dataset"])
-    allowed_languages = {code.upper() for code in cfg["language_codes"]}
+    if cfg.get("dataset"):
+        snapshot_download(**cfg["dataset"])
+    allowed_languages = {code.upper() for code in cfg.get("language_codes") or []}
+    models = cfg.get("models") or {}
+    quality_filters = cfg.get("quality_filters") or {}
 
     pipeline = Pipeline(name="curate_nemo_curator")
     pipeline.add_stage(JsonlReader(file_paths=cfg["input_glob"], fields=[cfg["text_field"]]))
-    pipeline.add_stage(
-        ScoreFilter(
-            FastTextLangId(
-                model_path=cfg["models"]["fasttext_langid"],
-                min_langid_score=cfg["quality_filters"]["min_langid_score"],
-            ),
-            text_field=cfg["text_field"],
-            score_field="language",
+    if allowed_languages:
+        Filter, ScoreFilter = text_filter_stages()
+        from nemo_curator.stages.text.filters.fasttext import FastTextLangId
+
+        pipeline.add_stage(
+            ScoreFilter(
+                FastTextLangId(
+                    model_path=models["fasttext_langid"],
+                    min_langid_score=quality_filters.get("min_langid_score", 0.0),
+                ),
+                text_field=cfg["text_field"],
+                score_field="language",
+            )
         )
-    )
-    pipeline.add_stage(
-        Filter(
-            filter_fn=lambda value: keep_language(value, allowed_languages),
-            filter_field="language",
+        pipeline.add_stage(
+            Filter(
+                filter_fn=lambda value: keep_language(value, allowed_languages),
+                filter_field="language",
+            )
         )
-    )
-    pipeline.add_stage(
-        ScoreFilter(
-            WordCountFilter(
-                min_words=cfg["quality_filters"]["min_words"],
-                max_words=cfg["quality_filters"]["max_words"],
-            ),
-            text_field=cfg["text_field"],
+
+    has_word_filter = any(key in quality_filters for key in ("min_words", "max_words"))
+    if has_word_filter:
+        if not all(key in quality_filters for key in ("min_words", "max_words")):
+            raise ValueError("quality_filters must set both min_words and max_words to enable WordCountFilter")
+        _, ScoreFilter = text_filter_stages()
+        from nemo_curator.stages.text.filters.heuristic import WordCountFilter
+
+        pipeline.add_stage(
+            ScoreFilter(
+                WordCountFilter(
+                    min_words=quality_filters["min_words"],
+                    max_words=quality_filters["max_words"],
+                ),
+                text_field=cfg["text_field"],
+            )
         )
-    )
-    pipeline.add_stage(
-        MultilingualDomainClassifier(
-            text_field=cfg["text_field"],
-            filter_by=cfg.get("domains") or None,
-            cache_dir=cfg["models"].get("hf_cache_dir"),
+    if cfg.get("domains"):
+        from nemo_curator.stages.text.classifiers import MultilingualDomainClassifier
+
+        pipeline.add_stage(
+            MultilingualDomainClassifier(
+                text_field=cfg["text_field"],
+                filter_by=cfg["domains"],
+                cache_dir=models.get("hf_cache_dir"),
+            )
         )
-    )
     pipeline.add_stage(JsonlWriter(path=cfg["output_dir"]))
 
-    ray_client = RayClient()
+    ray_client = RayClient(**ray_client_kwargs(cfg))
     ray_client.start()
     try:
         pipeline.run()
diff --git a/src/nemotron/steps/curate/nemo_curator/step.toml b/src/nemotron/steps/curate/nemo_curator/step.toml
index fd325cf2a..38180caaf 100644
--- a/src/nemotron/steps/curate/nemo_curator/step.toml
+++ b/src/nemotron/steps/curate/nemo_curator/step.toml
@@ -14,58 +14,109 @@
 
 [step]
 id = "curate/nemo_curator"
-name = "Data Acquisition & Curation (NeMo Curator)"
+name = "Lightweight Text Curation (NeMo Curator)"
 category = "curate"
-description = """Acquire public or custom text corpora with NeMo Curator, then annotate and filter them by language, domain, and quality to produce downstream-ready JSONL."""
-tags = ["curate", "data-acquisition", "nemo-curator", "jsonl", "ray"]
+description = """Read JSONL text with NeMo Curator, optionally hydrate a Hugging Face snapshot, apply light language, word-count, and domain filters, and write downstream-ready JSONL."""
+tags = ["curate", "nemo-curator", "jsonl", "ray", "huggingface"]
+
+[[consumes]]
+type = "raw_jsonl"
+description = "Raw local JSONL records, or a Hugging Face dataset snapshot materialized before Curator reads input_glob."
 
 [[produces]]
 type = "filtered_jsonl"
-description = "Filtered JSONL records with language/domain annotations suitable for downstream translation or training."
+description = "Filtered JSONL records suitable for downstream translation, pretraining prep, or SFT prep. Language/domain annotations are present only when those filters are enabled."
+
+[[parameters]]
+name = "input_glob"
+description = "JSONL file path or glob passed to Curator JsonlReader."
+
+[[parameters]]
+name = "output_dir"
+description = "Directory where Curator writes JSONL output shards."
+
+[[parameters]]
+name = "text_field"
+description = "Record field containing the text to curate."
+default = "text"
+
+[[parameters]]
+name = "dataset"
+description = "Optional Hugging Face snapshot_download kwargs. Set to null for local input_glob-only runs."
+default = "null"
 
 [[parameters]]
 name = "language_codes"
-description = "Target languages to keep during language identification and filtering."
-default = ["en"]
+description = "Language codes to keep (normalized to uppercase before matching). Set [] to skip FastText language identification and filtering."
+default = ["EN"]
 
 [[parameters]]
 name = "domains"
-description = "Requested content domains to keep or prioritize during domain classification and export."
+description = "Domains to keep through MultilingualDomainClassifier. Set [] to skip domain classification."
 default = []
 
 [[parameters]]
-name = "num_records"
-description = "Approximate number of records to acquire and retain after filtering."
-default = 100000
+name = "quality_filters"
+description = "Optional quality settings. min_langid_score applies when language filtering is enabled; min_words and max_words enable WordCountFilter and must be provided together."
+default = { min_langid_score = 0.3, min_words = 50, max_words = 5000 }
+
+[[parameters]]
+name = "models"
+description = "Optional model/cache paths such as fasttext_langid and hf_cache_dir. fasttext_langid is required when language_codes is non-empty."
+
+[[parameters]]
+name = "ray.num_cpus"
+description = "Optional RayClient CPU count. If omitted, the Lepton curate profile can provide NEMOTRON_CURATOR_RAY_NUM_CPUS."
 
 [[strategies]]
-when = "You are starting from remote public sources such as Common Crawl, Wikipedia, or custom URL lists"
-then = "Use Curator's download/extract composite stages first, then write JSONL for downstream filtering and export."
+when = "You need a quick infrastructure smoke test"
+then = "Use config/tiny.yaml with the lepton_curate profile. It reads a packaged tiny JSONL, disables optional filters, and uses the Curator container as-is."
 
 [[strategies]]
-when = "You already have local JSONL or Parquet data"
-then = "Skip remote acquisition and start with JsonlReader or ParquetReader, then apply language, quality, and domain filters."
+when = "You already have local JSONL data"
+then = "Set dataset=null, point input_glob at the JSONL files, and start with language_codes=[], domains=[], and quality_filters={} until the reader/writer path is verified."
 
 [[strategies]]
-when = "You need multilingual or domain-specific gating before training"
-then = "Apply FastText language ID plus DomainClassifier or MultilingualDomainClassifier before final export."
+when = "You want to pull a Hugging Face dataset snapshot first"
+then = "Set dataset.repo_id, dataset.repo_type, dataset.local_dir, and allow_patterns, then point input_glob into dataset.local_dir."
 
 [[strategies]]
-when = "Raw web data quality is noisy"
-then = "Start with heuristic ScoreFilter stages, then add QualityClassifier and deduplication for production-scale corpora."
+when = "You need language gating"
+then = "Set language_codes and models.fasttext_langid. Use quality_filters.min_langid_score to tune the language confidence threshold."
+
+[[strategies]]
+when = "You need a simple text quality gate"
+then = "Set both quality_filters.min_words and quality_filters.max_words to enable WordCountFilter."
+
+[[strategies]]
+when = "You need domain gating"
+then = "Set domains and optionally models.hf_cache_dir. Keep the first run small because the classifier can download/cache model assets."
 
 [[errors]]
 name = "empty_or_tiny_output"
-recovery = "Relax language/domain/quality filters, increase num_records, or inspect intermediate JSONL before continuing downstream."
+recovery = "Relax language/domain/word-count filters or inspect the input_glob records before continuing downstream."
 
 [[errors]]
-name = "large_file_oom"
-recovery = "Split very large JSONL or Parquet files into smaller chunks and tune files_per_partition or blocksize before reading them with Curator."
+name = "input_glob_no_matches"
+recovery = "Check that input_glob exists inside the container. For packaged tiny data use /nemo_run/code/src/nemotron/steps/curate/nemo_curator/data/tiny.jsonl."
 
 [[errors]]
 name = "missing_language_model"
-recovery = "Download the FastText lid.176 model and wire its path into FastTextLangId before enabling language-based filtering."
+recovery = "Set language_codes=[] to disable language filtering, or provide models.fasttext_langid pointing at the FastText lid.176 model."
+
+[[errors]]
+name = "incomplete_word_filter"
+recovery = "Set both quality_filters.min_words and quality_filters.max_words, or remove both keys to skip WordCountFilter."
+
+[[errors]]
+name = "not_enough_cpu_resources"
+recovery = "Set ray.num_cpus in YAML or NEMOTRON_CURATOR_RAY_NUM_CPUS in the env profile. The tiny Lepton profile uses 4 CPUs on cpu.medium."
+
+[[errors]]
+name = "large_file_oom"
+recovery = "Split very large JSONL files into smaller shards before reading them with Curator."
 
 [reference]
-script = "https://github.com/NVIDIA-NeMo/Curator/blob/main/nemo_curator/stages/text/download/common_crawl/stage.py"
+skill = "src/nemotron/steps/curate/nemo_curator/SKILL.md"
+script = "https://github.com/NVIDIA-NeMo/Curator/tree/main/nemo_curator/stages/text"
 docs = "https://docs.nvidia.com/nemo/curator/latest/"
diff --git a/src/nemotron/steps/prep/SKILL.md b/src/nemotron/steps/data_prep/SKILL.md
similarity index 55%
rename from src/nemotron/steps/prep/SKILL.md
rename to src/nemotron/steps/data_prep/SKILL.md
index 7cb192f5e..9a827d914 100644
--- a/src/nemotron/steps/prep/SKILL.md
+++ b/src/nemotron/steps/data_prep/SKILL.md
@@ -1,11 +1,11 @@
 ---
-name: nemotron-prep
-description: Navigate Nemotron data preparation steps for SFT packing, pretraining bin/idx tokenization, and RL prompt or preference sharding. Use when choosing, configuring, validating, or chaining prep steps before pretrain, CPT, SFT, PEFT, DPO, RLVR, or RLHF training, including sovereign customizations where blend composition decides downstream behavior.
+name: nemotron-data-prep
+description: Navigate Nemotron data preparation steps for SFT packing, pretraining bin/idx tokenization, and RL prompt or preference sharding. Use when choosing, configuring, validating, or chaining data_prep steps before pretrain, CPT, SFT, PEFT, DPO, RLVR, or RLHF training, including sovereign customizations where blend composition decides downstream behavior.
 ---
 
-# Nemotron Prep
+# Nemotron Data Prep
 
-Pick a prep step, lock its outputs to a tokenizer, and keep the prepared
+Pick a data_prep step, lock its outputs to a tokenizer, and keep the prepared
 artifact compatible with the downstream trainer. Prepared data is a
 **versioned data product** — name it after the (tokenizer, template, pack_size)
 tuple, not after the date.
@@ -14,11 +14,11 @@ tuple, not after the date.
 
 | Need | Step | Produces |
 |---|---|---|
-| Pack chat JSONL for Megatron-Bridge SFT or PEFT | [`prep/sft_packing`](sft_packing/SKILL.md) | `packed_parquet` |
-| Tokenize text into Megatron pretraining shards | [`prep/pretrain_prep`](pretrain_prep/SKILL.md) | `binidx` + `blend.json` |
-| Resolve and shard RL prompt or preference data | [`prep/rl_prep`](rl_prep/SKILL.md) | `training_jsonl` (sharded) |
+| Pack chat JSONL for Megatron-Bridge SFT or PEFT | [`data_prep/sft_packing`](sft_packing/SKILL.md) | `packed_parquet` |
+| Tokenize text into Megatron pretraining shards | [`data_prep/pretrain_prep`](pretrain_prep/SKILL.md) | `binidx` + `blend.json` |
+| Resolve and shard RL prompt or preference data | [`data_prep/rl_prep`](rl_prep/SKILL.md) | `training_jsonl` (sharded) |
 
-## When to use `prep/sft_packing`
+## When to use `data_prep/sft_packing`
 
 | Downstream trainer | Packing required? | Why |
 |---|---|---|
@@ -32,25 +32,45 @@ Skip packing when:
 
 ## Workflow
 
-1. **Env profile first** — verify the env profile for Lepton/Slurm/Ray/batch
-   runs (`env.toml` by default, or `NEMOTRON_ENV_FILE` for backend-specific
-   files).
-2. Read the target step's `step.toml` for artifacts, parameters, strategies,
+1. Read the target step's `step.toml` for artifacts, parameters, strategies,
    and references.
-3. Start with `config/tiny.yaml` for smoke tests, `config/default.yaml` for
+2. Start with `config/tiny.yaml` for smoke tests, `config/default.yaml` for
    production shape.
-4. Keep tokenizer, chat template, sequence length, split names, and shard
+3. Keep tokenizer, chat template, sequence length, split names, and shard
    policy aligned with the downstream trainer.
+4. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 5. Inspect sample outputs before launching expensive training.
 
 ## Smoke commands
 
 ```bash
-nemotron step run prep/sft_packing   -c tiny
-nemotron step run prep/pretrain_prep -c tiny
-nemotron step run prep/rl_prep       -c tiny
+uv run nemotron steps run data_prep/sft_packing   -c tiny --dry-run
+uv run nemotron steps run data_prep/pretrain_prep -c tiny --dry-run
+uv run nemotron steps run data_prep/rl_prep       -c tiny --dry-run
 ```
 
+## Project layout for generated configs
+
+Keep every generated overlay config and any supporting code under a single
+self-contained project root that also holds the local input data, so the
+whole directory is rsync/scp-portable to the remote machine that will run
+the data_prep step.
+
+- `<project>/config/` for generated YAML — never write into
+  `src/nemotron/steps/data_prep/<step>/config/`; the shipped `default.yaml`
+  and `tiny.yaml` stay as catalog references.
+- `<project>/data/` for source blends (`blend.json`), local JSONL inputs,
+  and the prepared artifact destination (packed Parquet shards, bin/idx +
+  emitted `blend.json`, or RL JSONL splits).
+- Tokenizer, chat-template, and `pack_size` / `seq_length` metadata should
+  be captured under the same project root so the prepared data and its
+  metadata ship to downstream training as one portable bundle.
+- Add project-root scripts only when catalog code cannot serve the request.
+- Do not split generated files into home dirs, scratch dirs, or paths
+  outside the project root that will not ship with the bundle.
+
 ## Patterns to cite
 
 - [../patterns/prep-data-is-tokenizer-locked.md](../patterns/prep-data-is-tokenizer-locked.md) — repack on tokenizer / template / seq_length changes.
diff --git a/src/nemotron/steps/data_prep/__init__.py b/src/nemotron/steps/data_prep/__init__.py
new file mode 100644
index 000000000..625cdea54
--- /dev/null
+++ b/src/nemotron/steps/data_prep/__init__.py
@@ -0,0 +1 @@
+"""Data preparation step category."""
diff --git a/src/nemotron/steps/prep/_common.py b/src/nemotron/steps/data_prep/_common.py
similarity index 100%
rename from src/nemotron/steps/prep/_common.py
rename to src/nemotron/steps/data_prep/_common.py
diff --git a/src/nemotron/steps/prep/guide.md b/src/nemotron/steps/data_prep/guide.md
similarity index 94%
rename from src/nemotron/steps/prep/guide.md
rename to src/nemotron/steps/data_prep/guide.md
index ad1e56270..4767b71b6 100644
--- a/src/nemotron/steps/prep/guide.md
+++ b/src/nemotron/steps/data_prep/guide.md
@@ -1,6 +1,6 @@
 # Data Prep — When You Need `sft_packing`
 
-| Downstream trainer | Need `prep/sft_packing`? | Why |
+| Downstream trainer | Need `data_prep/sft_packing`? | Why |
 |---|---|---|
 | `sft/megatron_bridge` | **Yes** | Megatron-Bridge SFT expects `packed_parquet`, not raw JSONL. |
 | AutoModel / HuggingFace-style SFT | **No** | These workflows can read `training_jsonl` directly. |
diff --git a/src/nemotron/steps/prep/pretrain_prep/SKILL.md b/src/nemotron/steps/data_prep/pretrain_prep/SKILL.md
similarity index 62%
rename from src/nemotron/steps/prep/pretrain_prep/SKILL.md
rename to src/nemotron/steps/data_prep/pretrain_prep/SKILL.md
index fecdf747c..848327d45 100644
--- a/src/nemotron/steps/prep/pretrain_prep/SKILL.md
+++ b/src/nemotron/steps/data_prep/pretrain_prep/SKILL.md
@@ -1,11 +1,11 @@
 ---
-name: nemotron-prep-pretrain-binidx
-description: Configure the Nemotron prep/pretrain_prep step that tokenizes HF or local text blends into Megatron bin/idx shards and a blend.json for pretrain/automodel or pretrain/megatron_bridge. Use when preparing pretraining or continued-pretraining data, rebuilding tokenizer-locked corpora, or validating data splits.
+name: nemotron-data-prep-pretrain-binidx
+description: Configure the Nemotron data_prep/pretrain_prep step that tokenizes HF or local text blends into Megatron bin/idx shards and a blend.json for pretrain/automodel or pretrain/megatron_bridge. Use when preparing pretraining or continued-pretraining data, rebuilding tokenizer-locked corpora, or validating data splits.
 ---
 
 # Pretrain Bin/Idx Prep
 
-Use `prep/pretrain_prep` when downstream pretraining expects Megatron `binidx` data.
+Use `data_prep/pretrain_prep` when downstream pretraining expects Megatron `binidx` data.
 
 Before changing configs or code, read `step.toml` to understand the step flow, consumed and produced artifacts, important parameters, strategies, failure modes, and upstream references.
 
@@ -17,8 +17,12 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Configure
 
+- Set `blend_path` to the source data blend; downstream trainers should use
+  the emitted `blend.json`.
 - Set `tokenizer.model` to the downstream pretraining model tokenizer.
 - Tune `num_shards` for target filesystem and trainer throughput.
+- Keep `valid_shards`, `test_shards`, and `split_seed` explicit so validation
+  data is reproducible.
 - Leave `max_doc_tokens` unset unless the data policy requires truncation.
 - Point pretrain configs at the emitted `blend.json`.
 - Check `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md` before changing tokenization, split, or sharding behavior.
@@ -33,10 +37,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Local Files
 
-- Contract: `src/nemotron/steps/prep/pretrain_prep/step.toml`
-- Runner: `src/nemotron/steps/prep/pretrain_prep/step.py`
-- Configs: `src/nemotron/steps/prep/pretrain_prep/config/default.yaml`, `src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml`
-- Sample blend: `src/nemotron/steps/prep/pretrain_prep/data/blend_tiny.json`
+- Contract: `src/nemotron/steps/data_prep/pretrain_prep/step.toml`
+- Runner: `src/nemotron/steps/data_prep/pretrain_prep/step.py`
+- Configs: `src/nemotron/steps/data_prep/pretrain_prep/config/default.yaml`, `src/nemotron/steps/data_prep/pretrain_prep/config/tiny.yaml`
+- Sample blend: `src/nemotron/steps/data_prep/pretrain_prep/data/blend_tiny.json`
 
 ## Guardrails
 
diff --git a/src/nemotron/steps/prep/pretrain_prep/config/default.yaml b/src/nemotron/steps/data_prep/pretrain_prep/config/default.yaml
similarity index 100%
rename from src/nemotron/steps/prep/pretrain_prep/config/default.yaml
rename to src/nemotron/steps/data_prep/pretrain_prep/config/default.yaml
diff --git a/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml b/src/nemotron/steps/data_prep/pretrain_prep/config/tiny.yaml
similarity index 77%
rename from src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml
rename to src/nemotron/steps/data_prep/pretrain_prep/config/tiny.yaml
index 95fd49e9b..0241b8efe 100644
--- a/src/nemotron/steps/prep/pretrain_prep/config/tiny.yaml
+++ b/src/nemotron/steps/data_prep/pretrain_prep/config/tiny.yaml
@@ -17,12 +17,12 @@
 # step's data/ dir so the same config works under any source layout.
 #
 # Usage:
-#   nemotron step run prep/pretrain_prep -c tiny                          # local
-#   nemotron step run prep/pretrain_prep -c tiny -r lepton_pretrain_dataprep
-#   nemotron step run prep/pretrain_prep -c tiny -r slurm_pretrain_dataprep
+#   nemotron steps run data_prep/pretrain_prep -c tiny                          # local
+#   nemotron steps run data_prep/pretrain_prep -c tiny -r lepton_prep_pretrain_prep
+#   nemotron steps run data_prep/pretrain_prep -c tiny -r slurm_prep_pretrain_prep
 
 # blend_path is omitted on purpose — step.py defaults to data/blend_tiny.json.
-output_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/pretrain_dataprep_tiny}
+output_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/data_prep/pretrain_prep/tiny}
 
 num_shards: 4
 valid_shards: 1
@@ -38,7 +38,8 @@ dtype: int32
 text_field: text
 
 # Cap docs / tokens for a fast smoke test.
-sample: 500
+max_rows: 1000
+sample: null
 sample_seed: 42
 force: false
 
diff --git a/src/nemotron/steps/prep/pretrain_prep/data/blend_tiny.json b/src/nemotron/steps/data_prep/pretrain_prep/data/blend_tiny.json
similarity index 100%
rename from src/nemotron/steps/prep/pretrain_prep/data/blend_tiny.json
rename to src/nemotron/steps/data_prep/pretrain_prep/data/blend_tiny.json
diff --git a/src/nemotron/steps/prep/pretrain_prep/step.py b/src/nemotron/steps/data_prep/pretrain_prep/step.py
similarity index 96%
rename from src/nemotron/steps/prep/pretrain_prep/step.py
rename to src/nemotron/steps/data_prep/pretrain_prep/step.py
index 124215c59..aeade5123 100644
--- a/src/nemotron/steps/prep/pretrain_prep/step.py
+++ b/src/nemotron/steps/data_prep/pretrain_prep/step.py
@@ -2,7 +2,7 @@
 # /// script
 # [tool.runspec]
 # schema = "1"
-# name = "steps/prep/pretrain_prep"
+# name = "steps/data_prep/pretrain_prep"
 # image = "anyscale/ray:2.49.2-py312"
 #
 # [tool.runspec.run]
@@ -35,7 +35,7 @@
 
 """Thin pretrain bin/idx wrapper. Tokenises HF/local text into Megatron bin/idx + blend.json.
 
-Mirrors the runtime safety pattern from prep/sft_packing:
+Mirrors the runtime safety pattern from data_prep/sft_packing:
   * Falls back to a self-contained ``data/blend_tiny.json`` when YAML omits
     ``blend_path``, so the same config works under any source layout
     (local / slurm /nemo_run/code / lepton /mnt/lustre-shared/_nemotron).
@@ -63,7 +63,7 @@
     load_omegaconf_yaml,
     parse_config_and_overrides,
 )
-from nemotron.steps.prep._common import (
+from nemotron.steps.data_prep._common import (
     chdir_to_scratch,
     config_dataclass,
     init_prep_wandb,
diff --git a/src/nemotron/steps/prep/pretrain_prep/step.toml b/src/nemotron/steps/data_prep/pretrain_prep/step.toml
similarity index 56%
rename from src/nemotron/steps/prep/pretrain_prep/step.toml
rename to src/nemotron/steps/data_prep/pretrain_prep/step.toml
index eca18863e..f1a5d56c6 100644
--- a/src/nemotron/steps/prep/pretrain_prep/step.toml
+++ b/src/nemotron/steps/data_prep/pretrain_prep/step.toml
@@ -13,14 +13,14 @@
 # limitations under the License.
 
 [step]
-id = "prep/pretrain_prep"
+id = "data_prep/pretrain_prep"
 name = "Pretrain Data — bin/idx Tokenisation"
-category = "prep"
+category = "data_prep"
 description = """\
 Tokenise raw text (HF datasets or local parquet/jsonl) into Megatron bin/idx
 shards and emit a blend.json that pretrain/megatron_bridge and pretrain/automodel
 can ingest directly."""
-tags = ["prep", "pretrain", "binidx", "tokenisation", "megatron"]
+tags = ["data_prep", "pretrain", "binidx", "tokenisation", "megatron"]
 
 [[consumes]]
 type = "filtered_jsonl"
@@ -30,6 +30,10 @@ description = "Curated text data — local files or HF datasets referenced via t
 type = "binidx"
 description = "Megatron bin/idx shards plus blend.json with per-split blends."
 
+[[parameters]]
+name = "blend_path"
+description = "Pretraining data blend JSON/YAML describing local files or HF datasets. Downstream pretrain configs consume the emitted blend.json, not this source blend."
+
 [[parameters]]
 name = "tokenizer.model"
 description = "HF tokenizer id; must match the downstream pretraining model."
@@ -44,6 +48,26 @@ default = 128
 name = "max_doc_tokens"
 description = "Optional per-document truncation. Leave unset for no truncation."
 
+[[parameters]]
+name = "valid_shards"
+description = "Number of emitted bin/idx shards assigned to validation in blend.json."
+default = 1
+
+[[parameters]]
+name = "test_shards"
+description = "Number of emitted bin/idx shards assigned to test in blend.json."
+default = 1
+
+[[parameters]]
+name = "split_seed"
+description = "Seed used when assigning emitted shards to train/validation/test splits."
+default = 42
+
+[[parameters]]
+name = "text_field"
+description = "Input record field containing pretraining text."
+default = "text"
+
 [[strategies]]
 when = "Source is HuggingFace (e.g. nvidia/Nemotron-CC-v2)"
 then = "Reference the HF dataset in the blend YAML — no manual download needed."
@@ -52,14 +76,31 @@ then = "Reference the HF dataset in the blend YAML — no manual download needed
 when = "Tokenizer changes"
 then = "Repack: bin/idx is tokenizer-locked."
 
+[[strategies]]
+when = "Preparing data for continued pretraining"
+then = "Scope blend ratios and held-out splits before scaling. Preserve the emitted blend.json and point both AutoModel and Megatron-Bridge pretrain configs at it."
+
 [[errors]]
 name = "tokenizer_mismatch"
 recovery = "Match the tokenizer to the downstream pretraining model, then rerun."
 
+[[errors]]
+name = "missing_or_invalid_blend"
+recovery = "Set blend_path to a valid data blend. Validate a tiny subset before production tokenization."
+
+[[errors]]
+name = "split_leakage_or_empty_validation"
+recovery = "Keep valid_shards/test_shards explicit and inspect the emitted blend.json before launching pretraining."
+
+[[errors]]
+name = "empty_or_overlong_documents"
+recovery = "Check text_field, min_doc_chars, max_doc_tokens, and token counts on a sample before full prep."
+
 [reference]
+skill = "src/nemotron/steps/data_prep/pretrain_prep/SKILL.md"
 script = "src/nemotron/data_prep/recipes/pretrain.py"
 megatron_bridge_repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 automodel_repo = "https://github.com/NVIDIA-NeMo/Automodel"
 automodel_docs = "https://docs.nvidia.com/nemo/automodel/latest/index.html"
-skills = ["skills/nemotron-customize/context/nemotron-data-prep.txt"]
+skills = ["skills/nemotron-customize/references/context/nemotron-data-prep.txt"]
diff --git a/src/nemotron/steps/prep/rl_prep/SKILL.md b/src/nemotron/steps/data_prep/rl_prep/SKILL.md
similarity index 55%
rename from src/nemotron/steps/prep/rl_prep/SKILL.md
rename to src/nemotron/steps/data_prep/rl_prep/SKILL.md
index f23c69c70..5458a1d49 100644
--- a/src/nemotron/steps/prep/rl_prep/SKILL.md
+++ b/src/nemotron/steps/data_prep/rl_prep/SKILL.md
@@ -1,11 +1,11 @@
 ---
-name: nemotron-prep-rl
-description: Configure the Nemotron prep/rl_prep step that resolves RL data blends and shards prompt, preference, or reward-model JSONL for rl/nemo_rl DPO, RLVR, and RLHF steps. Use before NeMo-RL training when data references need materialization, canonical split layout, or schema checks.
+name: nemotron-data-prep-rl
+description: Configure the Nemotron data_prep/rl_prep step that resolves RL data blends and shards prompt, preference, or reward-model JSONL for rl/nemo_rl DPO, RLVR, and RLHF steps. Use before NeMo-RL training when data references need materialization, canonical split layout, or schema checks.
 ---
 
 # RL Prep
 
-Use `prep/rl_prep` before NeMo-RL when prompt or preference data needs HF resolution, local materialization, or split sharding.
+Use `data_prep/rl_prep` before NeMo-RL when prompt or preference data needs HF resolution, local materialization, or split sharding.
 
 Before changing configs or code, read `step.toml` to understand the step flow, consumed and produced artifacts, important parameters, strategies, failure modes, and upstream references.
 
@@ -13,12 +13,14 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Consume `training_jsonl` through an RL data blend.
 - Produce sharded `training_jsonl` ready for `rl/nemo_rl/dpo`, `rl/nemo_rl/rlvr`, or `rl/nemo_rl/rlhf`.
-- Smoke with `nemotron step run prep/rl_prep -c tiny`.
+- Smoke with `nemotron steps run data_prep/rl_prep -c tiny`.
 
 ## Configure
 
+- Set `blend_path` to the RL data blend that should be resolved and sharded.
 - Keep `resolve_hf_placeholders=true` for closed-network or production clusters.
 - Set `num_shards_per_split` to match dataset size and filesystem throughput.
+- Keep output split names aligned with the downstream RL config.
 - For DPO, ensure records include prompt, chosen, and rejected responses.
 - For RLVR, ensure each prompt carries verifier fields such as ground-truth answers.
 - For RLHF, ensure prompt data and reward-model references are handled separately.
@@ -27,10 +29,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Local Files
 
-- Contract: `src/nemotron/steps/prep/rl_prep/step.toml`
-- Runner: `src/nemotron/steps/prep/rl_prep/step.py`
-- Configs: `src/nemotron/steps/prep/rl_prep/config/default.yaml`, `src/nemotron/steps/prep/rl_prep/config/tiny.yaml`
-- Sample blend: `src/nemotron/steps/prep/rl_prep/data/blend_tiny.json`
+- Contract: `src/nemotron/steps/data_prep/rl_prep/step.toml`
+- Runner: `src/nemotron/steps/data_prep/rl_prep/step.py`
+- Configs: `src/nemotron/steps/data_prep/rl_prep/config/default.yaml`, `src/nemotron/steps/data_prep/rl_prep/config/tiny.yaml`
+- Sample blend: `src/nemotron/steps/data_prep/rl_prep/data/blend_tiny.json`
 
 ## Guardrails
 
diff --git a/src/nemotron/steps/prep/rl_prep/config/default.yaml b/src/nemotron/steps/data_prep/rl_prep/config/default.yaml
similarity index 95%
rename from src/nemotron/steps/prep/rl_prep/config/default.yaml
rename to src/nemotron/steps/data_prep/rl_prep/config/default.yaml
index f5c513b91..9f8d4a8a2 100644
--- a/src/nemotron/steps/prep/rl_prep/config/default.yaml
+++ b/src/nemotron/steps/data_prep/rl_prep/config/default.yaml
@@ -15,7 +15,7 @@
 # RL data prep — resolve HF references and shard. HF-dataset compatible via the blend.
 
 blend_path: /path/to/rl_blend.json
-output_dir: /nemo_run/rl-prep
+output_dir: /nemo_run/data_prep/rl_prep
 num_shards_per_split: 1
 compression: none
 resolve_hf_placeholders: true
diff --git a/src/nemotron/steps/prep/rl_prep/config/tiny.yaml b/src/nemotron/steps/data_prep/rl_prep/config/tiny.yaml
similarity index 80%
rename from src/nemotron/steps/prep/rl_prep/config/tiny.yaml
rename to src/nemotron/steps/data_prep/rl_prep/config/tiny.yaml
index 1d676bc5b..bf623a74d 100644
--- a/src/nemotron/steps/prep/rl_prep/config/tiny.yaml
+++ b/src/nemotron/steps/data_prep/rl_prep/config/tiny.yaml
@@ -14,10 +14,11 @@
 
 defaults: default.yaml
 
-blend_path: ${oc.env:PWD}/src/nemotron/steps/prep/rl_prep/data/blend_tiny.json
-output_dir: ${oc.env:RL_OUTPUT_DIR,./output/rl-prep-tiny}
+blend_path: ${oc.env:PWD}/src/nemotron/steps/data_prep/rl_prep/data/blend_tiny.json
+output_dir: ${oc.env:RL_OUTPUT_DIR,./output/data_prep/rl_prep/tiny}
 num_shards_per_split: 1
-sample: 64
+max_rows: 1000
+sample: null
 force: false
 
 observability:
diff --git a/src/nemotron/steps/prep/rl_prep/data/blend_tiny.json b/src/nemotron/steps/data_prep/rl_prep/data/blend_tiny.json
similarity index 100%
rename from src/nemotron/steps/prep/rl_prep/data/blend_tiny.json
rename to src/nemotron/steps/data_prep/rl_prep/data/blend_tiny.json
diff --git a/src/nemotron/steps/prep/rl_prep/step.py b/src/nemotron/steps/data_prep/rl_prep/step.py
similarity index 92%
rename from src/nemotron/steps/prep/rl_prep/step.py
rename to src/nemotron/steps/data_prep/rl_prep/step.py
index 134a6d105..963eb9215 100644
--- a/src/nemotron/steps/prep/rl_prep/step.py
+++ b/src/nemotron/steps/data_prep/rl_prep/step.py
@@ -2,7 +2,7 @@
 # /// script
 # [tool.runspec]
 # schema = "1"
-# name = "steps/prep/rl_prep"
+# name = "steps/data_prep/rl_prep"
 # image = "anyscale/ray:2.49.2-py312"
 #
 # [tool.runspec.run]
@@ -47,7 +47,7 @@
     load_omegaconf_yaml,
     parse_config_and_overrides,
 )
-from nemotron.steps.prep._common import init_prep_wandb
+from nemotron.steps.data_prep._common import init_prep_wandb
 
 DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 
@@ -64,7 +64,7 @@ def main() -> None:
     run_rl_resolve_pipeline(
         blend=DataBlend.load(cfg["blend_path"]),
         output_dir=cfg["output_dir"],
-        sample=cfg.get("sample"),
+        sample=cfg.get("max_rows") if cfg.get("max_rows") is not None else cfg.get("sample"),
         force=cfg.get("force", False),
         compression=cfg.get("compression", "none"),
         num_shards_per_split=cfg.get("num_shards_per_split", 1),
diff --git a/src/nemotron/steps/data_prep/rl_prep/step.toml b/src/nemotron/steps/data_prep/rl_prep/step.toml
new file mode 100644
index 000000000..600658136
--- /dev/null
+++ b/src/nemotron/steps/data_prep/rl_prep/step.toml
@@ -0,0 +1,91 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[step]
+id = "data_prep/rl_prep"
+name = "RL Data Prep (resolve + shard)"
+category = "data_prep"
+description = """\
+Resolve HuggingFace dataset references in an RL data blend and shard the
+output JSONL into the prompt / preference layout expected by rl/nemo_rl/*."""
+tags = ["data_prep", "rl", "dpo", "grpo", "jsonl"]
+
+[[consumes]]
+type = "training_jsonl"
+description = "RL data blend referencing HF or local prompt / preference datasets."
+
+[[produces]]
+type = "training_jsonl"
+description = "Sharded JSONL splits ready for rl/nemo_rl/{dpo,rlvr,rlhf}."
+
+[[parameters]]
+name = "blend_path"
+description = "RL data blend describing local files or HF placeholders for prompt, preference, or reward-model data."
+
+[[parameters]]
+name = "num_shards_per_split"
+description = "Output shards per split."
+default = 1
+
+[[parameters]]
+name = "resolve_hf_placeholders"
+description = "If true, materialise HF placeholders into local JSONL (recommended for closed-network clusters)."
+default = true
+
+[[parameters]]
+name = "compression"
+description = "Output JSONL compression mode. Keep none for simplest downstream debugging unless storage pressure requires compression."
+default = "none"
+
+[[parameters]]
+name = "max_rows"
+description = "Optional row cap for smoke runs. Passed to the RL resolver as sample; takes precedence over sample when both are set."
+# No default: leave unset for no row cap (TOML has no null literal).
+
+[[strategies]]
+when = "Cluster has no HF Hub access"
+then = "Keep resolve_hf_placeholders=true so all data is materialised locally before training."
+
+[[strategies]]
+when = "Producing data for RLVR / GRPO"
+then = "Ensure each prompt record has the verifiable answer field (e.g. 'answer' for math)."
+
+[[strategies]]
+when = "Producing data for DPO"
+then = "Validate prompt, chosen, and rejected ordering on samples before sharding; inverted pairs silently train the wrong behavior."
+
+[[strategies]]
+when = "Producing data for RLHF"
+then = "Keep rollout prompt data separate from reward-model checkpoint/config references; the RLHF step consumes both but they are not the same artifact."
+
+[[errors]]
+name = "missing_or_invalid_blend"
+recovery = "Set blend_path to a resolvable RL blend and run a tiny resolve before a production shard."
+
+[[errors]]
+name = "invalid_dpo_preference_schema"
+recovery = "Ensure every DPO record has prompt, chosen, and rejected fields, and spot-check that chosen is actually preferred."
+
+[[errors]]
+name = "missing_rlvr_verifier_fields"
+recovery = "Add the answer, tests, env_metadata, or other verifier fields required by the active reward function before training."
+
+[[errors]]
+name = "split_names_do_not_match_rl_config"
+recovery = "Preserve train and validation split names expected by rl/nemo_rl configs, or override the RL data paths explicitly."
+
+[reference]
+skill = "src/nemotron/steps/data_prep/rl_prep/SKILL.md"
+script = "src/nemotron/data_prep/recipes/rl.py"
+skills = ["skills/nemotron-customize/references/context/nemotron-data-prep.txt"]
diff --git a/src/nemotron/steps/prep/sft_packing/SKILL.md b/src/nemotron/steps/data_prep/sft_packing/SKILL.md
similarity index 62%
rename from src/nemotron/steps/prep/sft_packing/SKILL.md
rename to src/nemotron/steps/data_prep/sft_packing/SKILL.md
index 2bfec47e2..c5d403ffa 100644
--- a/src/nemotron/steps/prep/sft_packing/SKILL.md
+++ b/src/nemotron/steps/data_prep/sft_packing/SKILL.md
@@ -1,11 +1,11 @@
 ---
-name: nemotron-prep-sft-packing
-description: Configure the Nemotron prep/sft_packing step that applies chat templates, tokenizes training JSONL, and emits Megatron-Bridge packed Parquet shards for SFT or PEFT. Use when preparing data for sft/megatron_bridge, peft/megatron_bridge, packed sequence training, loss-mask validation, or sequence-length alignment.
+name: nemotron-data-prep-sft-packing
+description: Configure the Nemotron data_prep/sft_packing step that applies chat templates, tokenizes training JSONL, and emits Megatron-Bridge packed Parquet shards for SFT or PEFT. Use when preparing data for sft/megatron_bridge, peft/megatron_bridge, packed sequence training, loss-mask validation, or sequence-length alignment.
 ---
 
 # SFT Packing
 
-Use `prep/sft_packing` when the downstream step consumes `packed_parquet`, especially `sft/megatron_bridge` or `peft/megatron_bridge`.
+Use `data_prep/sft_packing` when the downstream step consumes `packed_parquet`, especially `sft/megatron_bridge` or `peft/megatron_bridge`.
 
 Before changing configs or code, read `step.toml` to understand the step flow, consumed and produced artifacts, important parameters, strategies, failure modes, and upstream references.
 
@@ -21,6 +21,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Set `pack_size` equal to downstream `seq_length`.
 - Set `chat_template` to the target model family or template path.
 - Lower `num_shards` for small samples so shards remain useful.
+- Keep `train_ratio`, `valid_ratio`, and `test_ratio` explicit when downstream
+  training expects stable split directories.
 - Check `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md` before reusing packed data after tokenizer, template, or sequence-length changes.
 - Check `src/nemotron/steps/patterns/sft-sequence-packing.md` when deciding whether packing is useful for a corpus.
 
@@ -32,10 +34,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Local Files
 
-- Contract: `src/nemotron/steps/prep/sft_packing/step.toml`
-- Runner: `src/nemotron/steps/prep/sft_packing/step.py`
-- Configs: `src/nemotron/steps/prep/sft_packing/config/default.yaml`, `src/nemotron/steps/prep/sft_packing/config/tiny.yaml`
-- Sample blend: `src/nemotron/steps/prep/sft_packing/data/blend_tiny.json`
+- Contract: `src/nemotron/steps/data_prep/sft_packing/step.toml`
+- Runner: `src/nemotron/steps/data_prep/sft_packing/step.py`
+- Configs: `src/nemotron/steps/data_prep/sft_packing/config/default.yaml`, `src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml`
+- Sample blend: `src/nemotron/steps/data_prep/sft_packing/data/blend_tiny.json`
 
 ## Avoid
 
diff --git a/src/nemotron/steps/prep/sft_packing/config/default.yaml b/src/nemotron/steps/data_prep/sft_packing/config/default.yaml
similarity index 100%
rename from src/nemotron/steps/prep/sft_packing/config/default.yaml
rename to src/nemotron/steps/data_prep/sft_packing/config/default.yaml
diff --git a/src/nemotron/steps/prep/sft_packing/config/tiny.yaml b/src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml
similarity index 83%
rename from src/nemotron/steps/prep/sft_packing/config/tiny.yaml
rename to src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml
index e346d6de1..88cb7c596 100644
--- a/src/nemotron/steps/prep/sft_packing/config/tiny.yaml
+++ b/src/nemotron/steps/data_prep/sft_packing/config/tiny.yaml
@@ -17,14 +17,14 @@
 # without dragging the recipes/ tree along.
 #
 # Usage:
-#   nemotron step run prep/sft_packing -c tiny                          # local
-#   nemotron step run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny
-#   nemotron step run prep/sft_packing -c tiny -r test_lepton_sft_dataprep
+#   nemotron steps run data_prep/sft_packing -c tiny                          # local
+#   nemotron steps run data_prep/sft_packing -c tiny -r slurm_prep_sft_packing
+#   nemotron steps run data_prep/sft_packing -c tiny -r lepton_prep_sft_packing
 
 # blend_path is omitted on purpose — step.py defaults to the in-step
 # data/blend_tiny.json so the same config works under any source layout
 # (local, /nemo_run/code on slurm, /mnt/lustre-shared/_nemotron on lepton).
-output_dir: ${oc.env:SFT_OUTPUT_DIR,./output/stage1_sft_tiny}
+output_dir: ${oc.env:SFT_OUTPUT_DIR,./output/data_prep/sft_packing/tiny}
 
 tokenizer:
   model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
@@ -48,7 +48,8 @@ used_in_filter: null
 used_in_field: used_in
 
 max_doc_tokens: null
-sample: 1000
+max_rows: 1000
+sample: null
 sample_seed: 42
 force: false
 config_name: tiny
diff --git a/src/nemotron/steps/prep/sft_packing/data/blend_tiny.json b/src/nemotron/steps/data_prep/sft_packing/data/blend_tiny.json
similarity index 100%
rename from src/nemotron/steps/prep/sft_packing/data/blend_tiny.json
rename to src/nemotron/steps/data_prep/sft_packing/data/blend_tiny.json
diff --git a/src/nemotron/steps/prep/sft_packing/step.py b/src/nemotron/steps/data_prep/sft_packing/step.py
similarity index 98%
rename from src/nemotron/steps/prep/sft_packing/step.py
rename to src/nemotron/steps/data_prep/sft_packing/step.py
index f97e609b8..26af68ce7 100644
--- a/src/nemotron/steps/prep/sft_packing/step.py
+++ b/src/nemotron/steps/data_prep/sft_packing/step.py
@@ -2,7 +2,7 @@
 # /// script
 # [tool.runspec]
 # schema = "1"
-# name = "steps/prep/sft_packing"
+# name = "steps/data_prep/sft_packing"
 # image = "anyscale/ray:2.49.2-py312"
 #
 # [tool.runspec.run]
@@ -52,7 +52,7 @@
     load_omegaconf_yaml,
     parse_config_and_overrides,
 )
-from nemotron.steps.prep._common import (
+from nemotron.steps.data_prep._common import (
     chdir_to_scratch,
     config_dataclass,
     init_prep_wandb,
diff --git a/src/nemotron/steps/prep/sft_packing/step.toml b/src/nemotron/steps/data_prep/sft_packing/step.toml
similarity index 76%
rename from src/nemotron/steps/prep/sft_packing/step.toml
rename to src/nemotron/steps/data_prep/sft_packing/step.toml
index f49faafa6..9464485ed 100644
--- a/src/nemotron/steps/prep/sft_packing/step.toml
+++ b/src/nemotron/steps/data_prep/sft_packing/step.toml
@@ -13,11 +13,11 @@
 # limitations under the License.
 
 [step]
-id = "prep/sft_packing"
+id = "data_prep/sft_packing"
 name = "SFT Data Packing"
-category = "prep"
+category = "data_prep"
 description = """Apply the chat template, tokenize training JSONL, and pack examples into Megatron-Bridge-compatible Parquet shards for SFT."""
-tags = ["prep", "sft", "packing", "parquet", "megatron-bridge"]
+tags = ["data_prep", "sft", "packing", "parquet", "megatron-bridge"]
 
 [[consumes]]
 type = "training_jsonl"
@@ -53,6 +53,21 @@ name = "num_shards"
 description = "Number of packed Parquet shards to materialize for downstream loading."
 default = 128
 
+[[parameters]]
+name = "train_ratio"
+description = "Training split ratio used to materialize the canonical splits/{train,valid,test}/ Parquet layout when explicit split shard counts are not set."
+default = 0.98
+
+[[parameters]]
+name = "valid_ratio"
+description = "Validation split ratio used when explicit valid_shards is not set."
+default = 0.01
+
+[[parameters]]
+name = "test_ratio"
+description = "Test split ratio used when explicit test_shards is not set."
+default = 0.01
+
 [[strategies]]
 when = "The downstream trainer is sft/megatron_bridge or another consumer of packed Parquet"
 then = "Run this converter step before training so the dataset is materialized in packed_parquet format."
@@ -69,6 +84,10 @@ then = "Keep pack_size identical to the downstream training seq_length to avoid
 when = "You are iterating on a very small dataset or sample"
 then = "Reduce num_shards so each shard remains meaningfully sized and split generation stays balanced."
 
+[[strategies]]
+when = "You are validating a new chat template or tool-call format"
+then = "Inspect formatted prompts and loss_mask spans before treating packing efficiency or training loss as meaningful."
+
 [[errors]]
 name = "tokenizer_mismatch"
 recovery = "Use the same tokenizer and chat template as the downstream model; repack if either changes."
@@ -81,7 +100,12 @@ recovery = "Match pack_size to the downstream seq_length and packed_sequence_siz
 name = "too_many_tiny_shards"
 recovery = "Lower num_shards for small datasets or sampled runs so throughput and split balance remain healthy."
 
+[[errors]]
+name = "bad_loss_masks"
+recovery = "Inspect a few packed records for assistant-only loss masks, especially after changing chat_template, messages_field, tools_field, or tool-call formatting."
+
 [reference]
+skill = "src/nemotron/steps/data_prep/sft_packing/SKILL.md"
 recipe = "src/nemotron/recipes/nano3/stage1_sft/"
 script = "src/nemotron/recipes/nano3/stage1_sft/data_prep.py"
 docs = "src/nemotron/recipes/nano3/stage1_sft/README.md"
@@ -90,5 +114,5 @@ megatron_bridge_readme = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/ma
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 packed_sequences_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/packed-sequences.html"
 skills = [
-  "skills/nemotron-customize/context/nemotron-data-prep.txt",
+  "skills/nemotron-customize/references/context/nemotron-data-prep.txt",
 ]
diff --git a/src/nemotron/steps/env/SKILL.md b/src/nemotron/steps/env/SKILL.md
index e95691524..49e81a947 100644
--- a/src/nemotron/steps/env/SKILL.md
+++ b/src/nemotron/steps/env/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: nemotron-env
-description: Work with Nemotron execution environment profiles, especially env.toml generation, Lepton executor settings, RayCluster resources, mounts, container images, and profile inheritance. Use when selecting, creating, or debugging run profiles.
+description: Work with Nemotron execution environment profiles, especially env.toml generation, Lepton, Slurm, and DGX Cloud executor settings, RayCluster resources, mounts, container images, and profile inheritance. Use when selecting, creating, or debugging run profiles.
 ---
 
 # Env Steps
@@ -9,14 +9,14 @@ Use this category for execution-profile setup under `src/nemotron/steps/env/`.
 
 ## Route
 
-- `env/env_toml`: generate and validate starter env profile examples for Lepton or Slurm.
+- `env/env_toml`: generate and validate starter env profile examples for Lepton, Slurm, or DGX Cloud.
 
 ## Guardrails
 
 - Read the specific step `SKILL.md` and `step.toml` before editing env profiles.
-- Keep env profile files at the repository root. Default profile discovery uses `env.toml`; generated backend examples use `env.lepton.toml` or `env.slurm.toml` and require `export NEMOTRON_ENV_FILE=<file>`.
+- Keep env profile files at the repository root. Default profile discovery uses `env.toml`; generated backend examples use `env.lepton.toml`, `env.slurm.toml`, or `env.dgxcloud.toml` and require `export NEMOTRON_ENV_FILE=<file>`.
 - If the target env file exists, inspect and extend it rather than overwriting; only use `force=true` when the user intentionally asks to replace it.
 - Keep site logistics in env profiles and step runtime flags in the step YAML unless the flag is truly site-wide.
-- Keep data-prep step profiles CPU-only unless the step explicitly needs GPUs. Slurm prep profiles should override GPU bases with CPU partitions and `gpus_per_node = 0`; Lepton prep profiles should use a CPU resource shape.
-- Use the NeMo-RL v0.6.0 image for DPO/RLVR/RLHF profiles on Lepton and Slurm. On Lepton, keep `ray_version` on the latest version supported by the workspace rather than blindly matching the upstream NeMo-RL Ray pin.
+- Keep data-prep step profiles CPU-only unless the step explicitly needs GPUs. Slurm prep profiles should override GPU bases with CPU partitions and `gpus_per_node = 0`; Lepton prep profiles should use a CPU resource shape; DGX Cloud prep profiles should keep `gpus_per_node = 0` and stage source on `PYTHONPATH` for Ray workers.
+- Use the NeMo-RL v0.6.0 image for DPO/RLVR/RLHF profiles on Lepton, Slurm, and DGX Cloud. On Lepton, keep `ray_version` on the latest version supported by the workspace rather than blindly matching the upstream NeMo-RL Ray pin.
 - Compile one small run after profile changes and inspect `run.env` before submitting.
diff --git a/src/nemotron/steps/env/env_toml/SKILL.md b/src/nemotron/steps/env/env_toml/SKILL.md
index 256fe43ab..71604b5c2 100644
--- a/src/nemotron/steps/env/env_toml/SKILL.md
+++ b/src/nemotron/steps/env/env_toml/SKILL.md
@@ -1,13 +1,13 @@
 ---
 name: nemotron-env-toml
-description: Create, validate, and adjust Nemotron env.toml profiles for Lepton, Ray, NeMo-RL, NeMo-Gym, AutoModel, ModelOpt, and functional step runs. Use when a user needs an env.toml from scratch, profile inheritance fixes, executor/resource profile advice, or debugging Ray runtime-env and placement issues.
+description: Create, validate, and adjust Nemotron env.toml profiles for Lepton, Slurm, DGX Cloud, Ray, NeMo-RL, NeMo-Gym, AutoModel, ModelOpt, Curator, Data Designer SDG, and functional step runs. Use when a user needs an env.toml from scratch, profile inheritance fixes, executor/resource profile advice, or debugging Ray runtime-env and placement issues.
 ---
 
 # Env TOML
 
 Use `env/env_toml` to generate starter env profile files and to preserve hard-earned profile conventions.
 
-Before editing profiles, read `step.toml` for the contract, then choose `config/lepton.yaml` or `config/slurm.yaml`.
+Before editing profiles, read `step.toml` for the contract, then choose `config/lepton.yaml`, `config/slurm.yaml`, or `config/dgxcloud.yaml`.
 
 ## Workflow
 
@@ -16,31 +16,39 @@ Before editing profiles, read `step.toml` for the contract, then choose `config/
 2. When no backend env file exists, generate one:
 
 ```bash
-uv run nemotron step run env/env_toml -c lepton
+uv run nemotron steps run env/env_toml -c lepton
 export NEMOTRON_ENV_FILE=env.lepton.toml
 ```
 
 For Slurm:
 
 ```bash
-uv run nemotron step run env/env_toml -c slurm
+uv run nemotron steps run env/env_toml -c slurm
 export NEMOTRON_ENV_FILE=env.slurm.toml
 ```
 
-The loader in `src/nemo_runspec/env.py` searches for repository-root `env.toml` by default. To use a generated backend file, set `NEMOTRON_ENV_FILE` to `env.lepton.toml` or `env.slurm.toml` before running `--run` or `--batch` commands.
+For DGX Cloud:
+
+```bash
+uv run nemotron steps run env/env_toml -c dgxcloud
+export NEMOTRON_ENV_FILE=env.dgxcloud.toml
+```
+
+The loader in `src/nemo_runspec/env.py` searches for repository-root `env.toml` by default. To use a generated backend file, set `NEMOTRON_ENV_FILE` to `env.lepton.toml`, `env.slurm.toml`, or `env.dgxcloud.toml` before running `--run` or `--batch` commands.
 
 3. Set site-specific values in config overrides or edit the generated file:
    - Lepton: `node_group`, `resource_shape`, `nemo_run_dir`, mount `path`/`from`
    - Slurm: `host`, `user`, `account`, `partition`, `remote_job_dir`, mounts
+   - DGX Cloud: `base_url`, `kube_apiserver_url`, `client_id`, `project_name`, PVC name/claim/path, `pvc_nemo_run_dir`
    - Shared: container image, `HF_HOME`, output directories, W&B project/entity
 
 4. Validate by compiling one small case before submitting.
 
 ## Nuances
 
-- Prefer one backend base and concrete profiles named for individual steps, such as `lepton_prep_sft_packing`, `lepton_pretrain_megatron_bridge`, or `slurm_optimize_modelopt_quantize`.
+- Prefer one backend base and concrete profiles named for individual steps, such as `lepton_prep_sft_packing`, `lepton_pretrain_megatron_bridge`, `lepton_sdg_data_designer_tiny`, `slurm_optimize_modelopt_quantize`, or `dgxcloud_pretrain_megatron_bridge`.
 - Env profiles are inherited with `extends`; child profiles should override only what the step needs, such as image, node count, startup commands, or output path.
-- Data-prep profiles should be CPU-only by default. For Slurm prep profiles, override the GPU base with CPU partitions, `gpus_per_node = 0`, `build_include_gpus = false`, and enough `cpus_per_task` for Ray/Xenna. For Lepton prep profiles, use a CPU `resource_shape` and `gpus_per_node = 0`.
+- Data-prep profiles should be CPU-only by default. For Slurm prep profiles, override the GPU base with CPU partitions, `gpus_per_node = 0`, `build_include_gpus = false`, and enough `cpus_per_task` for Ray/Xenna. For Lepton prep profiles, use a CPU `resource_shape` and `gpus_per_node = 0`. For DGX Cloud prep profiles, keep `gpus_per_node = 0` and provide `RAY_RUNTIME_ENV_PYTHONPATH` for staged source.
 - Keep secrets as `${oc.env:...}` placeholders. Do not write tokens directly into env files.
 - Keep `[wandb]` for run metadata and pass `WANDB_API_KEY`/`WANDB_PROJECT` through profile `env_vars` so subprocess-heavy steps such as ModelOpt pruning/quantization inherit logging settings.
 - Do not put every NeMo-RL runtime flag in env files. Step YAML `run.env.env_vars` carries runtime-specific flags; the config loader deep-merges those with the selected env profile.
@@ -48,6 +56,8 @@ The loader in `src/nemo_runspec/env.py` searches for repository-root `env.toml`
 - For Ray jobs, avoid job `runtime_env` workdirs when vLLM or NeMo-RL starts nested Ray actors. Use staged source plus `PYTHONPATH` and keep source-transport cleanup in the runner, not in env.toml profiles.
 - For RLHF with GenRM, budget physical Ray nodes for policy/generation, NeMo-Gym GPU servers, and extra placement headroom. For example, a small logical `cluster.num_nodes=2` plus `env.nemo_gym.num_gpu_nodes=1` should use a 4x8-GPU Lepton profile until proven stable.
 - Use separate image bases: NeMo for Megatron Bridge, NeMo-RL `nvcr.io/nvidia/nemo-rl:v0.6.0` for DPO/RLVR/RLHF, NeMo-AutoModel for AutoModel, and NeMo 26.02 for ModelOpt.
+- For DGX Cloud profiles, keep Run:AI credentials and client secrets as `${oc.env:...}` placeholders, use existing PVC declarations for shared storage, and keep `pvc_nemo_run_dir` on the mounted workspace path.
+- Use Curator image profiles for `byob/mcq`, `translate/nemo_curator`, and `curate/nemo_curator`; use the normal NeMo image with `data-designer==0.5.5` for `sdg/data_designer`.
 - For Lepton NeMo-RL profiles, keep `ray_version` on the latest workspace-supported Ray version. NeMo-RL v0.6.0 pins Ray 2.54 upstream, but some Lepton workspaces may only accept older Ray versions such as 2.48.0.
 - Keep functional runner `gpu_count` aligned with the env profile, not only the step config.
 
@@ -55,7 +65,7 @@ The loader in `src/nemo_runspec/env.py` searches for repository-root `env.toml`
 
 - Contract: `src/nemotron/steps/env/env_toml/step.toml`
 - Runner: `src/nemotron/steps/env/env_toml/step.py`
-- Configs: `src/nemotron/steps/env/env_toml/config/lepton.yaml`, `src/nemotron/steps/env/env_toml/config/slurm.yaml`
+- Configs: `src/nemotron/steps/env/env_toml/config/lepton.yaml`, `src/nemotron/steps/env/env_toml/config/slurm.yaml`, `src/nemotron/steps/env/env_toml/config/dgxcloud.yaml`
 - Loader behavior: `src/nemo_runspec/env.py`, `src/nemo_runspec/config/loader.py`
 
 ## Guardrails
diff --git a/src/nemotron/steps/env/env_toml/config/dgxcloud.yaml b/src/nemotron/steps/env/env_toml/config/dgxcloud.yaml
new file mode 100644
index 000000000..adbb00f04
--- /dev/null
+++ b/src/nemotron/steps/env/env_toml/config/dgxcloud.yaml
@@ -0,0 +1,342 @@
+# Generate a DGX Cloud env.dgxcloud.toml that demonstrates step-linked profiles.
+#
+# Base Run:AI / DGX Cloud wiring is based on env.old.toml. Keep credentials in
+# environment variables; do not commit client secrets or access tokens.
+
+output_path: env.dgxcloud.toml
+force: false
+
+preamble: |
+  # Generated by `nemotron steps run env/env_toml -c dgxcloud`.
+  # Keep secrets out of this file; pass tokens through ${oc.env:...} placeholders.
+  #
+  # Example usage after review:
+  #   export NEMOTRON_ENV_FILE=env.dgxcloud.toml
+  #   uv run nemotron steps run data_prep/sft_packing -c tiny --batch dgxcloud_prep_sft_packing
+  #   uv run nemotron steps run data_prep/pretrain_prep -c tiny --batch dgxcloud_prep_pretrain_prep
+  #   uv run nemotron steps run data_prep/rl_prep -c tiny --batch dgxcloud_prep_rl_prep
+  #   uv run nemotron steps run pretrain/megatron_bridge -c tiny --batch dgxcloud_pretrain_megatron_bridge
+  #   uv run nemotron steps run pretrain/automodel -c tiny --batch dgxcloud_pretrain_automodel
+  #   uv run nemotron steps run sft/megatron_bridge -c tiny --batch dgxcloud_sft_megatron_bridge
+  #   uv run nemotron steps run sft/automodel -c tiny --batch dgxcloud_sft_automodel
+  #   uv run nemotron steps run peft/megatron_bridge -c tiny --batch dgxcloud_peft_megatron_bridge
+  #   uv run nemotron steps run peft/automodel -c tiny --batch dgxcloud_peft_automodel
+  #   uv run nemotron steps run convert/hf_to_megatron -c default --batch dgxcloud_convert_model
+  #   uv run nemotron steps run convert/megatron_to_hf -c default --batch dgxcloud_convert_model
+  #   uv run nemotron steps run convert/merge_lora -c default --batch dgxcloud_convert_model
+  #   uv run nemotron steps run rl/nemo_rl/dpo -c tiny --batch dgxcloud_rl_nemo_rl_dpo
+  #   uv run nemotron steps run rl/nemo_rl/rlvr -c tiny --batch dgxcloud_rl_nemo_rl_rlvr
+  #   uv run nemotron steps run rl/nemo_rl/rlhf -c tiny --batch dgxcloud_rl_nemo_rl_rlhf
+  #   uv run nemotron steps run optimize/modelopt/quantize -c tiny --batch dgxcloud_optimize_modelopt_quantize
+  #   uv run nemotron steps run optimize/modelopt/distill -c tiny --batch dgxcloud_optimize_modelopt_distill
+  #   uv run nemotron steps run optimize/modelopt/prune -c tiny --batch dgxcloud_optimize_modelopt_prune
+  #   uv run nemotron steps run eval/model_eval -c tiny_chat --batch dgxcloud_eval_model_eval
+  #   uv run nemotron steps run byob/mcq -c tiny --batch dgxcloud_byob_cpu stage=all
+  #   uv run nemotron steps run byob/mcq -c tiny --batch dgxcloud_byob_full stage=all
+  #   uv run nemotron steps run sdg/data_designer -c tiny --batch dgxcloud_sdg_data_designer_tiny
+  #   uv run nemotron steps run translate/nemo_curator -c default --batch dgxcloud_translate
+  #   uv run nemotron steps run curate/nemo_curator -c tiny --batch dgxcloud_curate
+
+checks:
+  required_profiles:
+    - dgxcloud_base
+    - dgxcloud_curator_base
+    - dgxcloud_prep_base
+    - dgxcloud_prep_sft_packing
+    - dgxcloud_prep_pretrain_prep
+    - dgxcloud_prep_rl_prep
+    - dgxcloud_pretrain_megatron_bridge
+    - dgxcloud_pretrain_automodel
+    - dgxcloud_sft_megatron_bridge
+    - dgxcloud_sft_automodel
+    - dgxcloud_peft_megatron_bridge
+    - dgxcloud_peft_automodel
+    - dgxcloud_convert_model
+    - dgxcloud_rl_nemo_rl_dpo
+    - dgxcloud_rl_nemo_rl_rlvr
+    - dgxcloud_rl_nemo_rl_rlhf
+    - dgxcloud_optimize_modelopt_quantize
+    - dgxcloud_optimize_modelopt_distill
+    - dgxcloud_optimize_modelopt_prune
+    - dgxcloud_eval_model_eval
+    - dgxcloud_byob_cpu
+    - dgxcloud_byob_full
+    - dgxcloud_sdg_data_designer
+    - dgxcloud_sdg_data_designer_tiny
+    - dgxcloud_sdg_data_designer_default
+    - dgxcloud_sdg_data_designer_customer_support_tools
+    - dgxcloud_sdg_data_designer_rl_pref
+    - dgxcloud_translate
+    - dgxcloud_curate
+  recommended_min_nodes:
+    # RLHF with GenRM needs policy/generation workers, NeMo-Gym GPU servers, and placement headroom.
+    dgxcloud_rl_nemo_rl_rlhf: 4
+
+sections:
+  wandb:
+    project: nemotron
+    entity: ${oc.env:WANDB_ENTITY,nvidia}
+
+  dgxcloud_base:
+    executor: dgxcloud
+    base_url: ${oc.env:DGXCLOUD_BASE_URL,https://masked-dgxcloud-runai-host/api/v1}
+    kube_apiserver_url: ${oc.env:DGXCLOUD_KUBE_APISERVER_URL,https://masked-dgxcloud-runai-host/k8s/clusters/masked-cluster-id}
+    client_id: ${oc.env:DGXCLOUD_CLIENT_ID,masked-client-id}
+    client_secret: ${oc.env:DGXCLOUD_CLIENT_SECRET,''}
+    project_name: ${oc.env:DGXCLOUD_PROJECT_NAME,masked-project-name}
+    container_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    pvc_nemo_run_dir: ${oc.env:NEMO_RUN_DIR,/workspace/nemo_run}
+    nodes: 1
+    gpus_per_node: 0
+    nprocs_per_node: 1
+    distributed_framework: PyTorch
+    pip_extras: [typer, rich, pydantic-settings]
+    pvcs:
+      - name: ${oc.env:DGXCLOUD_PVC_NAME,playbook-storage}
+        path: ${oc.env:DGXCLOUD_PVC_MOUNT,/workspace}
+        existingPvc: true
+        claimName: ${oc.env:DGXCLOUD_PVC_CLAIM,masked-pvc-claim-name}
+    env_vars:
+      HF_TOKEN: ${oc.env:HF_TOKEN,''}
+      HF_HOME: ${oc.env:HF_HOME,/workspace/hf}
+      WANDB_API_KEY: ${oc.env:WANDB_API_KEY,''}
+      WANDB_PROJECT: nemotron
+      RAY_DEDUP_LOGS: "0"
+
+  # data_prep/*: CPU-only data prep on DGX Cloud. The step's Ray workers need the
+  # staged source tree on PYTHONPATH.
+  dgxcloud_prep_base:
+    extends: dgxcloud_base
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages cosmos-xenna protobuf
+    env_vars:
+      RAY_RUNTIME_ENV_PYTHONPATH: /workspace/_nemotron/src
+
+  dgxcloud_prep_sft_packing:
+    extends: dgxcloud_prep_base
+    env_vars:
+      SFT_OUTPUT_DIR: /workspace/output/data_prep/sft_packing
+      RAY_RUNTIME_ENV_PYTHONPATH: /workspace/_nemotron/src
+
+  dgxcloud_prep_pretrain_prep:
+    extends: dgxcloud_prep_base
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: /workspace/output/data_prep/pretrain_prep
+      RAY_RUNTIME_ENV_PYTHONPATH: /workspace/_nemotron/src
+
+  dgxcloud_prep_rl_prep:
+    extends: dgxcloud_prep_base
+    env_vars:
+      RL_OUTPUT_DIR: /workspace/output/data_prep/rl_prep
+      RAY_RUNTIME_ENV_PYTHONPATH: /workspace/_nemotron/src
+
+  # Curator-backed steps (byob/mcq, translate/nemo_curator, curate/nemo_curator) all use the
+  # Curator container. byob and translate install their Python runtime through the generic curator_runtime
+  # bootstrap, using requirements generated from pyproject/uv.lock at submission time; curate runs directly.
+  dgxcloud_curator_base:
+    extends: dgxcloud_base
+    container_image: nvcr.io/nvidia/nemo-curator:26.02
+    nodes: 1
+    gpus_per_node: 0
+    nprocs_per_node: 1
+    pip_extras: []
+
+  # byob CPU smoke: keeps semantic dedup, coverage, and semantic outlier stages
+  # disabled. Those optional stages need the GPU/RAPIDS profile below.
+  dgxcloud_byob_cpu:
+    extends: dgxcloud_curator_base
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob -- python -m nemotron.steps.byob.mcq.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # byob full: adds the heavy GPU stack required by Curator semantic dedup and
+  # related embedding/semantic quality stages.
+  dgxcloud_byob_full:
+    extends: dgxcloud_byob_cpu
+    gpus_per_node: 1
+    nprocs_per_node: 1
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob-gpu -- python -m nemotron.steps.byob.mcq.step --config {config}
+
+  # sdg/data_designer: CPU-only synthetic generation through remote model
+  # endpoints. Config variants choose SFT vs tool-call vs RL preference output.
+  dgxcloud_sdg_data_designer:
+    extends: dgxcloud_base
+    gpus_per_node: 0
+    nprocs_per_node: 1
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages data-designer==0.5.5
+    env_vars:
+      SDG_OUTPUT_DIR: /workspace/output/sdg
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+
+  dgxcloud_sdg_data_designer_tiny:
+    extends: dgxcloud_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /workspace/output/sdg/sft/tiny
+
+  dgxcloud_sdg_data_designer_default:
+    extends: dgxcloud_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /workspace/output/sdg/sft/default
+
+  dgxcloud_sdg_data_designer_customer_support_tools:
+    extends: dgxcloud_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /workspace/output/sdg/sft/customer_support_tools
+
+  dgxcloud_sdg_data_designer_rl_pref:
+    extends: dgxcloud_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /workspace/output/sdg/rl/rl_pref
+
+  # translate/nemo_curator: uses the translate Curator runtime profile.
+  dgxcloud_translate:
+    extends: dgxcloud_curator_base
+    gpus_per_node: 1
+    nprocs_per_node: 1
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile translate -- python -m nemotron.steps.translate.nemo_curator.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # curate/nemo_curator: runs directly in the Curator container.
+  dgxcloud_curate:
+    extends: dgxcloud_curator_base
+    env_vars:
+      NEMOTRON_CURATOR_RAY_NUM_CPUS: "4"
+
+  # pretrain/megatron_bridge: GPU-backed training profile.
+  dgxcloud_pretrain_megatron_bridge:
+    extends: dgxcloud_base
+    nodes: 2
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: /workspace/output/pretrain/megatron_bridge
+      PRETRAIN_BLEND_PATH: /workspace/output/data_prep/pretrain_prep/blend.json
+
+  # pretrain/automodel: AutoModel pretrain uses the AutoModel image/deps with
+  # the same pretrain data-prep output wiring.
+  dgxcloud_pretrain_automodel:
+    extends: dgxcloud_base
+    container_image: nvcr.io/nvidia/nemo-automodel:26.04
+    nodes: 2
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: /workspace/output/pretrain/automodel
+      PRETRAIN_BLEND_PATH: /workspace/output/data_prep/pretrain_prep/blend.json
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # sft/megatron_bridge: GPU-backed SFT profile.
+  dgxcloud_sft_megatron_bridge:
+    extends: dgxcloud_base
+    nodes: 2
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    env_vars:
+      SFT_OUTPUT_DIR: /workspace/output/sft/megatron_bridge
+      SFT_PACKED_DIR: /workspace/output/data_prep/sft_packing/splits/train/*.parquet
+
+  # sft/automodel: AutoModel image/deps with the same DGX Cloud logistics.
+  dgxcloud_sft_automodel:
+    extends: dgxcloud_sft_megatron_bridge
+    container_image: nvcr.io/nvidia/nemo-automodel:26.04
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      SFT_OUTPUT_DIR: /workspace/output/sft/automodel
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # peft/*: PEFT uses the same training backends as SFT but writes to PEFT paths.
+  dgxcloud_peft_megatron_bridge:
+    extends: dgxcloud_sft_megatron_bridge
+    env_vars:
+      SFT_OUTPUT_DIR: /workspace/output/peft/megatron_bridge
+      SFT_PACKED_DIR: /workspace/output/data_prep/sft_packing/splits/train/*.parquet
+
+  dgxcloud_peft_automodel:
+    extends: dgxcloud_sft_automodel
+    env_vars:
+      SFT_OUTPUT_DIR: /workspace/output/peft/automodel
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # convert/*: one shared GPU profile for HF<->Megatron conversion and LoRA merge/export.
+  dgxcloud_convert_model:
+    extends: dgxcloud_base
+    nodes: 1
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    env_vars:
+      CONVERT_OUTPUT_DIR: /workspace/output/convert
+
+  # rl/nemo_rl/dpo: NeMo-RL image plus output override. Runtime RL knobs stay in step YAML.
+  dgxcloud_rl_nemo_rl_dpo:
+    extends: dgxcloud_base
+    container_image: nvcr.io/nvidia/nemo-rl:v0.6.0
+    nodes: 2
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      RL_OUTPUT_DIR: /workspace/output/rl/dpo
+
+  # rl/nemo_rl/rlvr: same NeMo-RL image, with RLVR output wiring.
+  dgxcloud_rl_nemo_rl_rlvr:
+    extends: dgxcloud_rl_nemo_rl_dpo
+    env_vars:
+      RL_OUTPUT_DIR: /workspace/output/rl/rlvr
+
+  # rl/nemo_rl/rlhf: same RL profile, but more physical nodes for GenRM/NeMo-Gym placement.
+  dgxcloud_rl_nemo_rl_rlhf:
+    extends: dgxcloud_rl_nemo_rl_dpo
+    nodes: 4
+    env_vars:
+      RL_OUTPUT_DIR: /workspace/output/rl/rlhf
+
+  # optimize/modelopt/quantize: switch to the ModelOpt-capable NeMo image.
+  dgxcloud_optimize_modelopt_quantize:
+    extends: dgxcloud_base
+    container_image: nvcr.io/nvidia/nemo:26.02
+    nodes: 2
+    gpus_per_node: 8
+    nprocs_per_node: 8
+    env_vars:
+      OPTIM_OUTPUT_DIR: /workspace/output/optimize/quantize
+
+  # optimize/modelopt/distill: same ModelOpt-capable image with distillation output wiring.
+  dgxcloud_optimize_modelopt_distill:
+    extends: dgxcloud_optimize_modelopt_quantize
+    env_vars:
+      OPTIM_OUTPUT_DIR: /workspace/output/optimize/distill
+
+  # optimize/modelopt/prune: pruning can run with fewer GPUs than full-node train jobs.
+  dgxcloud_optimize_modelopt_prune:
+    extends: dgxcloud_base
+    container_image: nvcr.io/nvidia/nemo:26.02
+    nodes: 1
+    gpus_per_node: 2
+    nprocs_per_node: 2
+    env_vars:
+      OPTIM_OUTPUT_DIR: /workspace/output/optimize/prune
+
+  # eval/model_eval: lightweight profile for model evaluation steps.
+  dgxcloud_eval_model_eval:
+    extends: dgxcloud_base
+    nodes: 1
+    gpus_per_node: 1
+    nprocs_per_node: 1
+    pip_extras:
+      - typer
+      - rich
+      - pydantic-settings
+      - nemo-evaluator-launcher
+    env_vars:
+      EVAL_OUTPUT_DIR: /workspace/output/eval/model_eval
diff --git a/src/nemotron/steps/env/env_toml/config/lepton.yaml b/src/nemotron/steps/env/env_toml/config/lepton.yaml
index 5d631d04f..917458d61 100644
--- a/src/nemotron/steps/env/env_toml/config/lepton.yaml
+++ b/src/nemotron/steps/env/env_toml/config/lepton.yaml
@@ -8,29 +8,68 @@ output_path: env.lepton.toml
 force: false
 
 preamble: |
-  # Generated by `nemotron step run env/env_toml -c lepton`.
+  # Generated by `nemotron steps run env/env_toml -c lepton`.
   # Keep secrets and site-private names out of this file; pass them through
   # ${oc.env:...} placeholders.
   #
   # Example usage after review:
   #   export NEMOTRON_ENV_FILE=env.lepton.toml
-  #   uv run nemotron step run prep/sft_packing -c default --batch lepton_prep_sft_packing
-  #   uv run nemotron step run pretrain/megatron_bridge -c default --batch lepton_pretrain_megatron_bridge
-  #   uv run nemotron step run rl/nemo_rl/dpo -c default --batch lepton_rl_nemo_rl_dpo
-  #   uv run nemotron step run rl/nemo_rl/rlvr -c default --batch lepton_rl_nemo_rl_rlvr
-  #   uv run nemotron step run rl/nemo_rl/rlhf -c default --batch lepton_rl_nemo_rl_rlhf
-  #   uv run nemotron step run optimize/modelopt/quantize -c default --batch lepton_optimize_modelopt_quantize
+  #   uv run nemotron steps run data_prep/sft_packing -c tiny --batch lepton_prep_sft_packing
+  #   uv run nemotron steps run data_prep/pretrain_prep -c tiny --batch lepton_prep_pretrain_prep
+  #   uv run nemotron steps run data_prep/rl_prep -c tiny --batch lepton_prep_rl_prep
+  #   uv run nemotron steps run pretrain/megatron_bridge -c tiny --batch lepton_pretrain_megatron_bridge
+  #   uv run nemotron steps run pretrain/automodel -c tiny --batch lepton_pretrain_automodel
+  #   uv run nemotron steps run sft/megatron_bridge -c tiny --batch lepton_sft_megatron_bridge
+  #   uv run nemotron steps run sft/automodel -c tiny --batch lepton_sft_automodel
+  #   uv run nemotron steps run peft/megatron_bridge -c tiny --batch lepton_peft_megatron_bridge
+  #   uv run nemotron steps run peft/automodel -c tiny --batch lepton_peft_automodel
+  #   uv run nemotron steps run convert/hf_to_megatron -c default --batch lepton_convert_model
+  #   uv run nemotron steps run convert/megatron_to_hf -c default --batch lepton_convert_model
+  #   uv run nemotron steps run convert/merge_lora -c default --batch lepton_convert_model
+  #   uv run nemotron steps run rl/nemo_rl/dpo -c tiny --batch lepton_rl_nemo_rl_dpo
+  #   uv run nemotron steps run rl/nemo_rl/rlvr -c tiny --batch lepton_rl_nemo_rl_rlvr
+  #   uv run nemotron steps run rl/nemo_rl/rlhf -c tiny --batch lepton_rl_nemo_rl_rlhf
+  #   uv run nemotron steps run optimize/modelopt/quantize -c tiny --batch lepton_optimize_modelopt_quantize
+  #   uv run nemotron steps run optimize/modelopt/distill -c tiny --batch lepton_optimize_modelopt_distill
+  #   uv run nemotron steps run optimize/modelopt/prune -c tiny --batch lepton_optimize_modelopt_prune
+  #   uv run nemotron steps run eval/model_eval -c tiny_chat --batch lepton_eval_model_eval
+  #   uv run nemotron steps run byob/mcq -c tiny --batch lepton_byob_cpu stage=all
+  #   uv run nemotron steps run byob/mcq -c tiny --batch lepton_byob_full stage=all
+  #   uv run nemotron steps run sdg/data_designer -c tiny --batch lepton_sdg_data_designer_tiny
+  #   uv run nemotron steps run translate/nemo_curator -c default --batch lepton_translate
+  #   uv run nemotron steps run curate/nemo_curator -c tiny --batch lepton_curate
 
 checks:
   required_profiles:
     - lepton_base
+    - lepton_curator_base
+    - lepton_prep_base
     - lepton_prep_sft_packing
+    - lepton_prep_pretrain_prep
+    - lepton_prep_rl_prep
     - lepton_pretrain_megatron_bridge
+    - lepton_pretrain_automodel
     - lepton_sft_megatron_bridge
+    - lepton_sft_automodel
+    - lepton_peft_megatron_bridge
+    - lepton_peft_automodel
+    - lepton_convert_model
     - lepton_rl_nemo_rl_dpo
     - lepton_rl_nemo_rl_rlvr
     - lepton_rl_nemo_rl_rlhf
     - lepton_optimize_modelopt_quantize
+    - lepton_optimize_modelopt_distill
+    - lepton_optimize_modelopt_prune
+    - lepton_eval_model_eval
+    - lepton_byob_cpu
+    - lepton_byob_full
+    - lepton_sdg_data_designer
+    - lepton_sdg_data_designer_tiny
+    - lepton_sdg_data_designer_default
+    - lepton_sdg_data_designer_customer_support_tools
+    - lepton_sdg_data_designer_rl_pref
+    - lepton_translate
+    - lepton_curate
   recommended_min_nodes:
     # RLHF with GenRM needs policy/generation workers, NeMo-Gym GPU servers, and placement headroom.
     lepton_rl_nemo_rl_rlhf: 4
@@ -65,35 +104,156 @@ sections:
       HF_HOME: ${oc.env:HF_HOME,/mnt/lustre-shared/hf}
       WANDB_API_KEY: ${oc.env:WANDB_API_KEY,''}
       WANDB_PROJECT: nemotron
+      RAY_ENABLE_UV_RUN_RUNTIME_ENV: "0"
       RAY_DEDUP_LOGS: "0"
       RAY_GRAFANA_IFRAME_HOST: ""
 
-  # prep/sft_packing: data prep is CPU-only. Override the GPU base so Lepton
+  # data_prep/*: prep jobs are CPU-only. Override the GPU base so Lepton
   # schedules a CPU shape and the executor does not request GPUs.
-  lepton_prep_sft_packing:
+  lepton_prep_base:
     extends: lepton_base
-    resource_shape: ${oc.env:LEPTON_CPU_SHAPE,cpu.medium}
+    resource_shape: ${oc.env:LEPTON_CPU_SHAPE,cpu.large}
     gpus_per_node: 0
     nprocs_per_node: 1
+    shared_memory_size: 4096
     startup_commands:
       - python -m pip install --quiet --break-system-packages cosmos-xenna protobuf
     env_vars:
-      SFT_OUTPUT_DIR: /mnt/lustre-shared/output/sft/packed
       RAY_RUNTIME_ENV_PYTHONPATH: /mnt/lustre-shared/_nemotron/src
 
+  lepton_prep_sft_packing:
+    extends: lepton_prep_base
+    env_vars:
+      SFT_OUTPUT_DIR: ${oc.env:SFT_OUTPUT_DIR,/mnt/lustre-shared/output/data_prep/sft_packing}
+      RAY_RUNTIME_ENV_PYTHONPATH: /mnt/lustre-shared/_nemotron/src
+
+  lepton_prep_pretrain_prep:
+    extends: lepton_prep_base
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: ${oc.env:PRETRAIN_OUTPUT_DIR,/mnt/lustre-shared/output/data_prep/pretrain_prep}
+      RAY_RUNTIME_ENV_PYTHONPATH: /mnt/lustre-shared/_nemotron/src
+
+  lepton_prep_rl_prep:
+    extends: lepton_prep_base
+    env_vars:
+      RL_OUTPUT_DIR: ${oc.env:RL_OUTPUT_DIR,/mnt/lustre-shared/output/data_prep/rl_prep}
+      RAY_RUNTIME_ENV_PYTHONPATH: /mnt/lustre-shared/_nemotron/src
+
+  # Curator-backed steps (byob/mcq, translate/nemo_curator, curate/nemo_curator) all use the
+  # Curator container. byob and translate install their Python runtime through the generic curator_runtime
+  # bootstrap, using requirements generated from pyproject/uv.lock at submission time; curate runs directly.
+  lepton_curator_base:
+    extends: lepton_base
+    container_image: nvcr.io/nvidia/nemo-curator:26.02
+    resource_shape: ${oc.env:LEPTON_CPU_SHAPE,cpu.large}
+    nodes: 1
+    gpus_per_node: 0
+    nprocs_per_node: 1
+    shared_memory_size: 4096
+    pip_extras: []
+    env_vars:
+      RAY_worker_maximum_startup_concurrency: "16"
+      RAY_num_prestart_python_workers: "16"
+
+  # byob CPU smoke: keeps semantic dedup, coverage, and semantic outlier stages
+  # disabled. Those optional stages need the GPU/RAPIDS profile below.
+  lepton_byob_cpu:
+    extends: lepton_curator_base
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob -- python -m nemotron.steps.byob.mcq.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # byob full: adds the heavy GPU stack required by Curator semantic dedup and
+  # related embedding/semantic quality stages.
+  lepton_byob_full:
+    extends: lepton_byob_cpu
+    resource_shape: ${oc.env:LEPTON_BYOB_GPU_SHAPE,gpu.a100-80gb}
+    gpus_per_node: 1
+    shared_memory_size: 8192
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob-gpu -- python -m nemotron.steps.byob.mcq.step --config {config}
+
+  # sdg/data_designer: CPU-only synthetic generation through remote model
+  # endpoints. Config variants choose SFT vs tool-call vs RL preference output.
+  lepton_sdg_data_designer:
+    extends: lepton_base
+    resource_shape: ${oc.env:LEPTON_CPU_SHAPE,cpu.large}
+    gpus_per_node: 0
+    nprocs_per_node: 1
+    shared_memory_size: 4096
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages data-designer==0.5.5
+    env_vars:
+      SDG_OUTPUT_DIR: ${oc.env:SDG_OUTPUT_DIR,/mnt/lustre-shared/output/sdg}
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+
+  lepton_sdg_data_designer_tiny:
+    extends: lepton_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: ${oc.env:SDG_OUTPUT_DIR,/mnt/lustre-shared/output/sdg/sft/tiny}
+
+  lepton_sdg_data_designer_default:
+    extends: lepton_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: ${oc.env:SDG_OUTPUT_DIR,/mnt/lustre-shared/output/sdg/sft/default}
+
+  lepton_sdg_data_designer_customer_support_tools:
+    extends: lepton_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: ${oc.env:SDG_OUTPUT_DIR,/mnt/lustre-shared/output/sdg/sft/customer_support_tools}
+
+  lepton_sdg_data_designer_rl_pref:
+    extends: lepton_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: ${oc.env:SDG_OUTPUT_DIR,/mnt/lustre-shared/output/sdg/rl/rl_pref}
+
+  # translate/nemo_curator: uses the translate Curator runtime profile.
+  lepton_translate:
+    extends: lepton_curator_base
+    resource_shape: ${oc.env:LEPTON_TRANSLATE_GPU_SHAPE,gpu.a100-80gb}
+    gpus_per_node: 1
+    shared_memory_size: 8192
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile translate -- python -m nemotron.steps.translate.nemo_curator.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # curate/nemo_curator: runs directly in the Curator container. Keep this
+  # simple; translate/byob use curator_runtime because they need extra deps.
+  lepton_curate:
+    extends: lepton_curator_base
+    env_vars:
+      NEMOTRON_CURATOR_RAY_NUM_CPUS: "4"
+
   # pretrain/megatron_bridge: same NeMo image as SFT, with pretrain output wiring.
   lepton_pretrain_megatron_bridge:
     extends: lepton_base
     nodes: 2
     env_vars:
-      PRETRAIN_OUTPUT_DIR: /mnt/lustre-shared/output/pretrain/megatron_bridge
+      PRETRAIN_OUTPUT_DIR: ${oc.env:PRETRAIN_OUTPUT_DIR,/mnt/lustre-shared/output/pretrain/megatron_bridge}
+      PRETRAIN_BLEND_PATH: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/data_prep/pretrain_prep/blend.json}
+
+  # pretrain/automodel: AutoModel pretrain uses the AutoModel image/deps with
+  # the same pretrain data-prep output wiring.
+  lepton_pretrain_automodel:
+    extends: lepton_base
+    container_image: nvcr.io/nvidia/nemo-automodel:26.04
+    nodes: 2
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: ${oc.env:PRETRAIN_OUTPUT_DIR,/mnt/lustre-shared/output/pretrain/automodel}
+      PRETRAIN_BLEND_PATH: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/data_prep/pretrain_prep/blend.json}
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
 
   # sft/megatron_bridge: overrides only scale and output location.
   lepton_sft_megatron_bridge:
     extends: lepton_base
     nodes: 2
     env_vars:
-      SFT_OUTPUT_DIR: /mnt/lustre-shared/output/sft/megatron_bridge
+      SFT_OUTPUT_DIR: ${oc.env:SFT_OUTPUT_DIR,/mnt/lustre-shared/output/sft/megatron_bridge}
+      SFT_PACKED_DIR: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/data_prep/sft_packing/splits/train/*.parquet}
 
   # sft/automodel: demonstrates extending an individual step profile and changing image/deps.
   lepton_sft_automodel:
@@ -102,7 +262,30 @@ sections:
     startup_commands:
       - python -m pip install --quiet --break-system-packages omegaconf
     env_vars:
-      SFT_OUTPUT_DIR: /mnt/lustre-shared/output/sft/automodel
+      SFT_OUTPUT_DIR: ${oc.env:SFT_OUTPUT_DIR,/mnt/lustre-shared/output/sft/automodel}
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # peft/*: PEFT uses the same training backends as SFT but writes to PEFT paths.
+  lepton_peft_megatron_bridge:
+    extends: lepton_sft_megatron_bridge
+    env_vars:
+      SFT_OUTPUT_DIR: ${oc.env:SFT_OUTPUT_DIR,/mnt/lustre-shared/output/peft/megatron_bridge}
+      SFT_PACKED_DIR: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/data_prep/sft_packing/splits/train/*.parquet}
+
+  lepton_peft_automodel:
+    extends: lepton_sft_automodel
+    env_vars:
+      SFT_OUTPUT_DIR: ${oc.env:SFT_OUTPUT_DIR,/mnt/lustre-shared/output/peft/automodel}
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # convert/*: one shared GPU profile for HF<->Megatron conversion and LoRA merge/export.
+  lepton_convert_model:
+    extends: lepton_base
+    nodes: 1
+    env_vars:
+      CONVERT_OUTPUT_DIR: ${oc.env:CONVERT_OUTPUT_DIR,/mnt/lustre-shared/output/convert}
 
   # rl/nemo_rl/dpo: NeMo-RL image plus output override. Runtime RL knobs stay in step YAML.
   lepton_rl_nemo_rl_dpo:
@@ -134,3 +317,31 @@ sections:
     nodes: 2
     env_vars:
       OPTIM_OUTPUT_DIR: /mnt/lustre-shared/output/optimize/quantize
+
+  # optimize/modelopt/distill: same ModelOpt-capable image with distillation output wiring.
+  lepton_optimize_modelopt_distill:
+    extends: lepton_optimize_modelopt_quantize
+    env_vars:
+      OPTIM_OUTPUT_DIR: /mnt/lustre-shared/output/optimize/distill
+
+  # optimize/modelopt/prune: pruning can run on a smaller 2-GPU shape.
+  lepton_optimize_modelopt_prune:
+    extends: lepton_base
+    container_image: nvcr.io/nvidia/nemo:26.02
+    resource_shape: ${oc.env:LEPTON_PRUNE_GPU_SHAPE,gpu.2xa100-80gb}
+    gpus_per_node: 2
+    nprocs_per_node: 2
+    env_vars:
+      OPTIM_OUTPUT_DIR: /mnt/lustre-shared/output/optimize/prune
+
+  # eval/model_eval: lightweight profile for model evaluation steps.
+  lepton_eval_model_eval:
+    extends: lepton_base
+    nodes: 1
+    pip_extras:
+      - typer
+      - rich
+      - pydantic-settings
+      - nemo-evaluator-launcher
+    env_vars:
+      EVAL_OUTPUT_DIR: /mnt/lustre-shared/output/eval/model_eval
diff --git a/src/nemotron/steps/env/env_toml/config/slurm.yaml b/src/nemotron/steps/env/env_toml/config/slurm.yaml
index af1aecb6d..05f94f519 100644
--- a/src/nemotron/steps/env/env_toml/config/slurm.yaml
+++ b/src/nemotron/steps/env/env_toml/config/slurm.yaml
@@ -7,28 +7,67 @@ output_path: env.slurm.toml
 force: false
 
 preamble: |
-  # Generated by `nemotron step run env/env_toml -c slurm`.
+  # Generated by `nemotron steps run env/env_toml -c slurm`.
   # Keep secrets out of this file; pass tokens through ${oc.env:...} placeholders.
   #
   # Example usage after review:
   #   export NEMOTRON_ENV_FILE=env.slurm.toml
-  #   uv run nemotron step run prep/sft_packing -c default --batch slurm_prep_sft_packing
-  #   uv run nemotron step run pretrain/megatron_bridge -c default --batch slurm_pretrain_megatron_bridge
-  #   uv run nemotron step run sft/megatron_bridge -c default --batch slurm_sft_megatron_bridge
-  #   uv run nemotron step run rl/nemo_rl/dpo -c default --batch slurm_rl_nemo_rl_dpo
-  #   uv run nemotron step run rl/nemo_rl/rlvr -c default --batch slurm_rl_nemo_rl_rlvr
-  #   uv run nemotron step run rl/nemo_rl/rlhf -c default --batch slurm_rl_nemo_rl_rlhf
+  #   uv run nemotron steps run data_prep/sft_packing -c tiny --batch slurm_prep_sft_packing
+  #   uv run nemotron steps run data_prep/pretrain_prep -c tiny --batch slurm_prep_pretrain_prep
+  #   uv run nemotron steps run data_prep/rl_prep -c tiny --batch slurm_prep_rl_prep
+  #   uv run nemotron steps run pretrain/megatron_bridge -c tiny --batch slurm_pretrain_megatron_bridge
+  #   uv run nemotron steps run pretrain/automodel -c tiny --batch slurm_pretrain_automodel
+  #   uv run nemotron steps run sft/megatron_bridge -c tiny --batch slurm_sft_megatron_bridge
+  #   uv run nemotron steps run sft/automodel -c tiny --batch slurm_sft_automodel
+  #   uv run nemotron steps run peft/megatron_bridge -c tiny --batch slurm_peft_megatron_bridge
+  #   uv run nemotron steps run peft/automodel -c tiny --batch slurm_peft_automodel
+  #   uv run nemotron steps run convert/hf_to_megatron -c default --batch slurm_convert_model
+  #   uv run nemotron steps run convert/megatron_to_hf -c default --batch slurm_convert_model
+  #   uv run nemotron steps run convert/merge_lora -c default --batch slurm_convert_model
+  #   uv run nemotron steps run rl/nemo_rl/dpo -c tiny --batch slurm_rl_nemo_rl_dpo
+  #   uv run nemotron steps run rl/nemo_rl/rlvr -c tiny --batch slurm_rl_nemo_rl_rlvr
+  #   uv run nemotron steps run rl/nemo_rl/rlhf -c tiny --batch slurm_rl_nemo_rl_rlhf
+  #   uv run nemotron steps run optimize/modelopt/quantize -c tiny --batch slurm_optimize_modelopt_quantize
+  #   uv run nemotron steps run optimize/modelopt/distill -c tiny --batch slurm_optimize_modelopt_distill
+  #   uv run nemotron steps run optimize/modelopt/prune -c tiny --batch slurm_optimize_modelopt_prune
+  #   uv run nemotron steps run eval/model_eval -c tiny_chat --batch slurm_eval_model_eval
+  #   uv run nemotron steps run byob/mcq -c tiny --batch slurm_byob_cpu stage=all
+  #   uv run nemotron steps run byob/mcq -c tiny --batch slurm_byob_full stage=all
+  #   uv run nemotron steps run sdg/data_designer -c tiny --batch slurm_sdg_data_designer_tiny
+  #   uv run nemotron steps run translate/nemo_curator -c default --batch slurm_translate
+  #   uv run nemotron steps run curate/nemo_curator -c tiny --batch slurm_curate
 
 checks:
   required_profiles:
     - slurm_base
+    - slurm_curator_base
+    - slurm_prep_base
     - slurm_prep_sft_packing
+    - slurm_prep_pretrain_prep
+    - slurm_prep_rl_prep
     - slurm_pretrain_megatron_bridge
+    - slurm_pretrain_automodel
     - slurm_sft_megatron_bridge
+    - slurm_sft_automodel
+    - slurm_peft_megatron_bridge
+    - slurm_peft_automodel
+    - slurm_convert_model
     - slurm_rl_nemo_rl_dpo
     - slurm_rl_nemo_rl_rlvr
     - slurm_rl_nemo_rl_rlhf
     - slurm_optimize_modelopt_quantize
+    - slurm_optimize_modelopt_distill
+    - slurm_optimize_modelopt_prune
+    - slurm_eval_model_eval
+    - slurm_byob_cpu
+    - slurm_byob_full
+    - slurm_sdg_data_designer
+    - slurm_sdg_data_designer_tiny
+    - slurm_sdg_data_designer_default
+    - slurm_sdg_data_designer_customer_support_tools
+    - slurm_sdg_data_designer_rl_pref
+    - slurm_translate
+    - slurm_curate
 
 sections:
   wandb:
@@ -59,9 +98,9 @@ sections:
       PYTHONPATH: /nemo_run/code/src
       RAY_DEDUP_LOGS: "0"
 
-  # prep/sft_packing: data prep is CPU-only. Override the GPU base so Slurm
+  # data_prep/*: prep jobs are CPU-only. Override the GPU base so Slurm
   # does not submit to GPU partitions or request GPUs for the image import/job.
-  slurm_prep_sft_packing:
+  slurm_prep_base:
     extends: slurm_base
     partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
     build_partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
@@ -74,10 +113,130 @@ sections:
     startup_commands:
       - uv pip install --python /opt/venv/bin/python --quiet cosmos-xenna protobuf
     env_vars:
-      SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sft/packed
       RAY_RUNTIME_ENV_PYTHONPATH: /nemo_run/code/src
       RAY_MAX_LIMIT_FROM_API_SERVER: "100000"
 
+  slurm_prep_sft_packing:
+    extends: slurm_prep_base
+    env_vars:
+      SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/data_prep/sft_packing
+      RAY_RUNTIME_ENV_PYTHONPATH: /nemo_run/code/src
+      RAY_MAX_LIMIT_FROM_API_SERVER: "100000"
+
+  slurm_prep_pretrain_prep:
+    extends: slurm_prep_base
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: /lustre/${oc.env:USER}/output/data_prep/pretrain_prep
+      RAY_RUNTIME_ENV_PYTHONPATH: /nemo_run/code/src
+      RAY_MAX_LIMIT_FROM_API_SERVER: "100000"
+
+  slurm_prep_rl_prep:
+    extends: slurm_prep_base
+    env_vars:
+      RL_OUTPUT_DIR: /lustre/${oc.env:USER}/output/data_prep/rl_prep
+      RAY_RUNTIME_ENV_PYTHONPATH: /nemo_run/code/src
+      RAY_MAX_LIMIT_FROM_API_SERVER: "100000"
+
+  # Curator-backed steps: byob/mcq, translate/nemo_curator, and curate/nemo_curator all run
+  # in the Curator container. byob and translate bootstrap extra deps via curator_runtime
+  # (requirements generated from pyproject/uv.lock at submission time); curate runs directly.
+  slurm_curator_base:
+    extends: slurm_base
+    container_image: nvcr.io/nvidia/nemo-curator:26.02
+    partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
+    build_partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
+    build_include_gpus: false
+    build_gpus_per_node: 0
+    ntasks_per_node: 1
+    cpus_per_task: 4
+    gpus_per_node: 0
+    time: "01:00:00"
+
+  # byob CPU smoke: keeps semantic dedup, coverage, and semantic outlier stages
+  # disabled. Those optional stages need the GPU/RAPIDS profile below.
+  slurm_byob_cpu:
+    extends: slurm_curator_base
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob -- python -m nemotron.steps.byob.mcq.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # byob full: adds the heavy GPU stack required by Curator semantic dedup and
+  # related embedding/semantic quality stages.
+  slurm_byob_full:
+    extends: slurm_byob_cpu
+    partition: ${oc.env:SLURM_GPU_PARTITION,${oc.env:SLURM_PARTITION,batch}}
+    build_partition: ${oc.env:SLURM_GPU_PARTITION,${oc.env:SLURM_PARTITION,batch}}
+    build_include_gpus: true
+    build_gpus_per_node: 1
+    ntasks_per_node: 1
+    cpus_per_task: 8
+    gpus_per_node: 1
+    time: "02:00:00"
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile byob-gpu -- python -m nemotron.steps.byob.mcq.step --config {config}
+
+  # sdg/data_designer: CPU-only synthetic generation through remote model
+  # endpoints. Config variants choose SFT vs tool-call vs RL preference output.
+  slurm_sdg_data_designer:
+    extends: slurm_base
+    container_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
+    build_partition: ${oc.env:SLURM_CPU_PARTITION,cpu}
+    build_include_gpus: false
+    build_gpus_per_node: 0
+    ntasks_per_node: 1
+    cpus_per_task: 4
+    gpus_per_node: 0
+    time: "01:00:00"
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages data-designer==0.5.5
+    env_vars:
+      SDG_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sdg
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+
+  slurm_sdg_data_designer_tiny:
+    extends: slurm_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sdg/sft/tiny
+
+  slurm_sdg_data_designer_default:
+    extends: slurm_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sdg/sft/default
+
+  slurm_sdg_data_designer_customer_support_tools:
+    extends: slurm_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sdg/sft/customer_support_tools
+
+  slurm_sdg_data_designer_rl_pref:
+    extends: slurm_sdg_data_designer
+    env_vars:
+      SDG_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sdg/rl/rl_pref
+
+  # translate/nemo_curator: uses the translate Curator runtime profile.
+  slurm_translate:
+    extends: slurm_curator_base
+    partition: ${oc.env:SLURM_GPU_PARTITION,${oc.env:SLURM_PARTITION,batch}}
+    build_partition: ${oc.env:SLURM_GPU_PARTITION,${oc.env:SLURM_PARTITION,batch}}
+    build_include_gpus: true
+    build_gpus_per_node: 1
+    ntasks_per_node: 1
+    cpus_per_task: 8
+    gpus_per_node: 1
+    time: "02:00:00"
+    run_command: python -m nemotron.steps._bootstrap.curator_runtime --profile translate -- python -m nemotron.steps.translate.nemo_curator.step --config {config}
+    env_vars:
+      NVIDIA_API_KEY: ${oc.env:NVIDIA_API_KEY,''}
+      NGC_API_KEY: ${oc.env:NGC_API_KEY,''}
+
+  # curate/nemo_curator: runs directly in the Curator container. Keep this
+  # simple; translate/byob use curator_runtime because they need extra deps.
+  slurm_curate:
+    extends: slurm_curator_base
+    env_vars:
+      NEMOTRON_CURATOR_RAY_NUM_CPUS: "4"
+
   # pretrain/megatron_bridge: child overrides scale/time and pretrain output path.
   slurm_pretrain_megatron_bridge:
     extends: slurm_base
@@ -86,6 +245,23 @@ sections:
     time: "04:00:00"
     env_vars:
       PRETRAIN_OUTPUT_DIR: /lustre/${oc.env:USER}/output/pretrain/megatron_bridge
+      PRETRAIN_BLEND_PATH: /lustre/${oc.env:USER}/output/data_prep/pretrain_prep/blend.json
+
+  # pretrain/automodel: AutoModel pretrain uses the AutoModel image/deps with
+  # the same pretrain data-prep output wiring.
+  slurm_pretrain_automodel:
+    extends: slurm_base
+    container_image: nvcr.io/nvidia/nemo-automodel:26.04
+    nodes: 2
+    ntasks_per_node: 8
+    time: "04:00:00"
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      PRETRAIN_OUTPUT_DIR: /lustre/${oc.env:USER}/output/pretrain/automodel
+      PRETRAIN_BLEND_PATH: /lustre/${oc.env:USER}/output/data_prep/pretrain_prep/blend.json
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
 
   # sft/megatron_bridge: child overrides scale/time while inheriting site logistics.
   slurm_sft_megatron_bridge:
@@ -95,6 +271,40 @@ sections:
     time: "04:00:00"
     env_vars:
       SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sft/megatron_bridge
+      SFT_PACKED_DIR: /lustre/${oc.env:USER}/output/data_prep/sft_packing/splits/train/*.parquet
+
+  # sft/automodel: switches to the AutoModel image/deps.
+  slurm_sft_automodel:
+    extends: slurm_sft_megatron_bridge
+    container_image: nvcr.io/nvidia/nemo-automodel:26.04
+    startup_commands:
+      - python -m pip install --quiet --break-system-packages omegaconf
+    env_vars:
+      SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/sft/automodel
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # peft/*: PEFT uses the same training backends as SFT but writes to PEFT paths.
+  slurm_peft_megatron_bridge:
+    extends: slurm_sft_megatron_bridge
+    env_vars:
+      SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/peft/megatron_bridge
+      SFT_PACKED_DIR: /lustre/${oc.env:USER}/output/data_prep/sft_packing/splits/train/*.parquet
+
+  slurm_peft_automodel:
+    extends: slurm_sft_automodel
+    env_vars:
+      SFT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/peft/automodel
+      WANDB_INIT_TIMEOUT: ${oc.env:WANDB_INIT_TIMEOUT,300}
+      WANDB_HTTP_TIMEOUT: ${oc.env:WANDB_HTTP_TIMEOUT,300}
+
+  # convert/*: one shared GPU profile for HF<->Megatron conversion and LoRA merge/export.
+  slurm_convert_model:
+    extends: slurm_base
+    nodes: 1
+    time: "02:00:00"
+    env_vars:
+      CONVERT_OUTPUT_DIR: /lustre/${oc.env:USER}/output/convert
 
   # rl/nemo_rl/dpo: switch image for NeMo-RL while keeping tunnel/account/mounts.
   slurm_rl_nemo_rl_dpo:
@@ -128,3 +338,28 @@ sections:
     time: "04:00:00"
     env_vars:
       OPTIM_OUTPUT_DIR: /lustre/${oc.env:USER}/output/optimize/quantize
+
+  # optimize/modelopt/distill: same ModelOpt-capable image with distillation output wiring.
+  slurm_optimize_modelopt_distill:
+    extends: slurm_optimize_modelopt_quantize
+    env_vars:
+      OPTIM_OUTPUT_DIR: /lustre/${oc.env:USER}/output/optimize/distill
+
+  # optimize/modelopt/prune: pruning can run with fewer tasks than full-node train jobs.
+  slurm_optimize_modelopt_prune:
+    extends: slurm_base
+    container_image: nvcr.io/nvidia/nemo:26.02
+    nodes: 1
+    ntasks_per_node: 2
+    gpus_per_node: 2
+    time: "02:00:00"
+    env_vars:
+      OPTIM_OUTPUT_DIR: /lustre/${oc.env:USER}/output/optimize/prune
+
+  # eval/model_eval: lightweight profile for model evaluation steps.
+  slurm_eval_model_eval:
+    extends: slurm_base
+    nodes: 1
+    time: "02:00:00"
+    env_vars:
+      EVAL_OUTPUT_DIR: /lustre/${oc.env:USER}/output/eval/model_eval
diff --git a/src/nemotron/steps/env/env_toml/step.py b/src/nemotron/steps/env/env_toml/step.py
index ea6b75fce..10f437e66 100644
--- a/src/nemotron/steps/env/env_toml/step.py
+++ b/src/nemotron/steps/env/env_toml/step.py
@@ -151,6 +151,8 @@ def _toml_value(value: Any) -> str:
     if isinstance(value, str):
         return json.dumps(value)
     if isinstance(value, list):
+        if not value:
+            return "[]"
         if all(isinstance(item, dict) for item in value):
             return "[\n" + ",\n".join(f"    {_inline_table(item)}" for item in value) + ",\n]"
         return "[" + ", ".join(_toml_value(item) for item in value) + "]"
diff --git a/src/nemotron/steps/env/env_toml/step.toml b/src/nemotron/steps/env/env_toml/step.toml
index ec29553f0..542e4c5c7 100644
--- a/src/nemotron/steps/env/env_toml/step.toml
+++ b/src/nemotron/steps/env/env_toml/step.toml
@@ -2,8 +2,8 @@
 id = "env/env_toml"
 name = "Env TOML Generator"
 category = "env"
-description = """Generate and validate step-linked env profile examples from compact YAML templates for Lepton or Slurm, including inheritance, image overrides, mounts, env-var placeholders, and Ray/RL guardrails."""
-tags = ["env", "lepton", "slurm", "ray", "profiles", "toml"]
+description = """Generate and validate step-linked env profile examples from compact YAML templates for Lepton, Slurm, or DGX Cloud, including inheritance, image overrides, mounts, env-var placeholders, Curator/Data Designer profiles, and Ray/RL guardrails."""
+tags = ["env", "lepton", "slurm", "dgxcloud", "ray", "profiles", "toml", "curator", "sdg"]
 
 [[produces]]
 type = "env_toml"
@@ -12,7 +12,7 @@ description = "A generated env profile file containing reusable execution profil
 [[parameters]]
 name = "output_path"
 description = "Where to write the generated env profile file. Keep this at the repository root for normal use."
-default = "env.lepton.toml or env.slurm.toml from the selected config"
+default = "env.lepton.toml, env.slurm.toml, or env.dgxcloud.toml from the selected config"
 
 [[parameters]]
 name = "force"
@@ -25,11 +25,15 @@ description = "Ordered mapping of env profile sections to render. Each section m
 
 [[strategies]]
 when = "Creating a Lepton env profile from scratch"
-then = "Start from config/lepton.yaml, run from the repository root so output_path=env.lepton.toml lands beside pyproject.toml, set node_group, resource_shape, nemo_run_dir, workspace, and mount paths, then export NEMOTRON_ENV_FILE=env.lepton.toml before using the resulting profiles with `nemotron step run`."
+then = "Start from config/lepton.yaml, run from the repository root so output_path=env.lepton.toml lands beside pyproject.toml, set node_group, resource_shape, nemo_run_dir, workspace, and mount paths, then export NEMOTRON_ENV_FILE=env.lepton.toml before using the resulting profiles with `nemotron steps run`."
 
 [[strategies]]
 when = "Creating a Slurm env profile from scratch"
-then = "Start from config/slurm.yaml, run from the repository root so output_path=env.slurm.toml lands beside pyproject.toml, set host, user, account, partition, remote_job_dir, and mounts, then export NEMOTRON_ENV_FILE=env.slurm.toml before using the resulting profiles with `nemotron step run`."
+then = "Start from config/slurm.yaml, run from the repository root so output_path=env.slurm.toml lands beside pyproject.toml, set host, user, account, partition, remote_job_dir, and mounts, then export NEMOTRON_ENV_FILE=env.slurm.toml before using the resulting profiles with `nemotron steps run`."
+
+[[strategies]]
+when = "Creating a DGX Cloud env profile from scratch"
+then = "Start from config/dgxcloud.yaml, run from the repository root so output_path=env.dgxcloud.toml lands beside pyproject.toml, set Run:AI endpoint credentials, project/PVC values, and shared workspace paths, then export NEMOTRON_ENV_FILE=env.dgxcloud.toml before using the resulting profiles with `nemotron steps run`."
 
 [[strategies]]
 when = "Configuring NeMo-RL or NeMo-Gym profiles"
@@ -52,10 +56,12 @@ name = "rlhf_genrm_server_missing"
 recovery = "Use a step-specific RLHF profile with enough physical nodes for policy/generation, NeMo-Gym GPU workers, and placement headroom. Keep source-transport cleanup in the runner and resource sizing in env.toml."
 
 [reference]
+skill = "src/nemotron/steps/env/env_toml/SKILL.md"
 runner = "src/nemotron/steps/env/env_toml/step.py"
 configs = [
   "src/nemotron/steps/env/env_toml/config/lepton.yaml",
   "src/nemotron/steps/env/env_toml/config/slurm.yaml",
+  "src/nemotron/steps/env/env_toml/config/dgxcloud.yaml",
 ]
 related = [
   "src/nemo_runspec/env.py",
diff --git a/src/nemotron/steps/eval/model_eval/SKILL.md b/src/nemotron/steps/eval/model_eval/SKILL.md
index 11fe2392d..32be2efc1 100644
--- a/src/nemotron/steps/eval/model_eval/SKILL.md
+++ b/src/nemotron/steps/eval/model_eval/SKILL.md
@@ -1,34 +1,33 @@
 ---
 name: nemotron-eval-model-eval
-description: Configure Nemotron eval/model_eval to deploy a trained checkpoint behind an OpenAI-compatible endpoint and run NeMo Evaluator benchmark suites. Use for chat/instruction benchmarks, log-probability tasks, reasoning-model evaluation, and producing consolidated eval_results from checkpoint_hf or checkpoint_megatron inputs.
+description: Configure Nemotron eval/model_eval to run NeMo Evaluator Launcher benchmark suites for hosted endpoints or Megatron Bridge checkpoints. Use for MMLU, HellaSwag, standard English benchmarks, container-backed tasks, sovereign or multilingual benchmark containers, and consolidated eval_results.
 ---
 
 # Model Evaluation (NeMo Evaluator)
 
-Use `eval/model_eval` to score a checkpoint on standard benchmarks. The step
-deploys the model behind an OpenAI-compatible endpoint, then NeMo Evaluator
-hits that endpoint with the benchmark suite.
+Use `eval/model_eval` to run NeMo Evaluator Launcher. This generic step follows
+the same launcher pattern as `nano3 eval` and `super3 eval`: compile YAML,
+strip Nemotron-only run metadata, then call `run_eval()`.
 
 Before changing configs, read `step.toml` end-to-end for the full
 strategies/errors/parameters list.
 
 ## Inputs and outputs
 
-- Consume **either** `checkpoint_megatron` (iter_* dir from Megatron-Bridge)
-  or `checkpoint_hf` (HF safetensors). Both are optional in the manifest, but
-  exactly one must be present at run time.
-- Produce `eval_results` — benchmark metrics + artifacts + run summary.
+- Consume a hosted endpoint config or `checkpoint_megatron`.
+- Produce `eval_results`: benchmark metrics, artifacts, logs, and optional W&B export.
 
 ## Configure
 
-- **Match endpoint type to benchmark family.** Chat/instruction → chat
-  endpoint. Log-probability (arc_challenge, hellaswag, piqa, etc.) → completions
-  endpoint with `logprobs` support.
-- **Tokenizer is required for log-probability tasks.** For `checkpoint_megatron`,
-  point at `checkpoint/tokenizer`. For `checkpoint_hf`, use the HF handle or path.
-- **Megatron deployments need the iter_* path**, not the parent output dir.
-- **Reasoning models** need higher `max_new_tokens`, reasoning-trace
-  processing on, and the temperature/top_p from the model card.
+- For hosted endpoints, use `config/tiny_chat.yaml`; it sets `deployment.type=none` and configures `target.api_endpoint`.
+- For Megatron checkpoint evaluation, start from `config/default.yaml`; it follows the Nano3/Super3 pattern where NeMo Evaluator Launcher owns deployment and benchmark execution.
+- Store only the API key environment variable name in YAML, usually `NVIDIA_API_KEY`.
+- Use exact model IDs returned by the serving provider when available.
+- Use exact task IDs from `nemo-evaluator-launcher ls tasks` or `nemo-evaluator-launcher ls task <task_id>`. Packaged tasks use IDs such as `mmlu_instruct`, `adlr_mmlu`, and `hellaswag`; do not prepend the harness name unless the task container exposes that exact dotted ID.
+- Match task choice to endpoint capability. Hosted NVIDIA Integrate smoke tests use `tiny_chat.yaml` with a chat-compatible task. Logprob benchmarks such as HellaSwag need a completions endpoint with logprobs and are not part of the hosted QA smoke path.
+- Start smoke tests with `limit_samples=1` before full MMLU or HellaSwag runs.
+- Megatron deployments need the concrete `iter_*` path, not the parent output dir.
+- Reasoning models often need a higher `max_new_tokens` and the sampling settings (temperature/top_p) from the model card.
 - Reference [src/nemotron/steps/patterns/eval-before-and-after-training.md](../../patterns/eval-before-and-after-training.md)
   before treating any single eval as a result.
 
@@ -36,12 +35,33 @@ strategies/errors/parameters list.
 
 - Contract: [step.toml](step.toml)
 - Runner: [step.py](step.py)
-- Configs: `config/default.yaml`, `config/tiny.yaml`
+- Runtime helpers: [runtime.py](runtime.py)
+- Configs: `config/default.yaml` for Megatron checkpoint evaluation, `config/tiny_chat.yaml` for hosted chat smoke tests
+
+## Common Commands
+
+Hosted chat MMLU smoke:
+
+```bash
+export NEMO_EVALUATOR_MODEL_ID=<exact-model-id>
+export NEMO_EVALUATOR_MODEL_URL=<openai-compatible-chat-completions-endpoint-url>
+export NEMO_EVALUATOR_API_KEY_NAME=NVIDIA_API_KEY
+export NEMO_EVALUATOR_ENDPOINT_TYPE=chat
+
+uv run nemotron steps run eval/model_eval -c tiny_chat
+```
+
+Preview compiled launcher config without running:
+
+```bash
+uv run nemotron steps run eval/model_eval -c tiny_chat dry_run=true
+```
 
 ## Guardrails
 
 - Don't compare scores across different endpoint types or different
   generation settings.
-- Don't add `convert/megatron_to_hf` "just in case" — pick one input artifact
-  and configure the matching deployment path.
+- Don't add checkpoint conversion "just in case"; pick the artifact format and configure the matching deployment path.
+- Don't use the direct NeMo Evaluator API path for this step; use NeMo Evaluator Launcher.
+- Don't put raw API keys in YAML or command output.
 - Inspect a handful of generations before trusting aggregate metrics.
diff --git a/src/nemotron/steps/eval/model_eval/config/default.yaml b/src/nemotron/steps/eval/model_eval/config/default.yaml
index 2e7cd0457..65a87c9e2 100644
--- a/src/nemotron/steps/eval/model_eval/config/default.yaml
+++ b/src/nemotron/steps/eval/model_eval/config/default.yaml
@@ -1,25 +1,111 @@
-# Standard NeMo Evaluator starter config for a deployed model
-# Uses completions-style evaluation for standard NLU benchmarks.
+# Standard NeMo Evaluator Launcher config for Megatron checkpoint evaluation.
+#
+# This mirrors the Nano3/Super3 eval shape: the `run` section is used by
+# Nemotron for env/profile/artifact interpolation, then removed before handing
+# the config to NeMo Evaluator Launcher.
 
+dry_run: false
 output_dir: ./results
 
+run:
+  # Use a concrete Megatron Bridge iter_* checkpoint via
+  # `deployment.checkpoint_path=...`, or keep this as a W&B artifact reference
+  # consumed by `${art:model,path}`.
+  model: model:latest
+  env:
+    executor: local
+    container_image: nvcr.io/nvidia/nemo:25.11.nemotron_3_nano
+    host: ${oc.env:HOSTNAME,localhost}
+    user: ${oc.env:USER,''}
+    account: null
+    partition: null
+    remote_job_dir: ${oc.env:PWD}/.nemotron
+    time: "04:00:00"
+  wandb:
+    entity: null
+    project: null
+
+execution:
+  type: ${run.env.executor}
+  hostname: ${run.env.host}
+  username: ${run.env.user}
+  account: ${run.env.account}
+  partition: ${run.env.partition}
+  output_dir: ${output_dir}
+  walltime: ${run.env.time}
+  num_nodes: ${oc.select:run.env.nodes,1}
+  deployment:
+    n_tasks: ${execution.num_nodes}
+  auto_export:
+    destinations:
+      - wandb
+  env_vars:
+    deployment:
+      HF_HOME: ${run.env.remote_job_dir}/hf
+      HF_TOKEN: HF_TOKEN
+      NIM_CACHE_PATH: ${run.env.remote_job_dir}/nim
+      VLLM_CACHE_ROOT: ${run.env.remote_job_dir}/vllm
+    evaluation:
+      HF_HOME: ${run.env.remote_job_dir}/hf
+      HF_TOKEN: HF_TOKEN
+  mounts:
+    deployment: {}
+    evaluation: {}
+    mount_home: false
+
 deployment:
-  model_id: megatron_model
-  url: http://0.0.0.0:8080/v1/completions/
-  endpoint_type: completions
-  api_key_name: NGC_API_KEY
+  type: generic
+  image: ${run.env.container_image}
+  checkpoint_path: ${art:model,path}
+  multiple_instances: false
+  port: 1235
+  served_model_name: nemo-model
+  health_check_path: /v1/health
+  command: >-
+    bash -c 'export TRITON_CACHE_DIR=/tmp/triton_cache;
+    python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_inframework.py
+    --megatron_checkpoint /checkpoint/
+    --num_gpus ${oc.select:run.env.gpus_per_node,1}
+    --tensor_model_parallel_size 1
+    --expert_model_parallel_size 1
+    --port 1235
+    --num_replicas 1'
+  endpoints:
+    chat: /v1/chat/completions/
+    completions: /v1/completions/
+    health: /v1/health
 
-benchmarks:
-  - mmlu
-  - hellaswag
-  - arc_challenge
+evaluation:
+  nemo_evaluator_config:
+    config:
+      params:
+        max_retries: 5
+        parallelism: 4
+        request_timeout: 6000
+        limit_samples: null
+        extra:
+          tokenizer: ${deployment.checkpoint_path}/tokenizer
+          tokenizer_backend: huggingface
+    target:
+      api_endpoint:
+        adapter_config:
+          output_dir: /results
+          use_progress_tracking: false
+          use_caching: true
+          caching_dir: /results/cache
+          use_response_logging: true
+          max_logged_responses: 10
+          use_request_logging: true
+          max_logged_requests: 10
+  tasks:
+    - name: adlr_mmlu
+      nemo_evaluator_config:
+        config:
+          params:
+            top_p: 0.0
+    - name: hellaswag
 
-params:
-  temperature: 0
-  top_p: 0
-  parallelism: 1
-  request_timeout: 3600
-  limit_samples: null
-  extra:
-    tokenizer: /path/to/checkpoint/tokenizer
-    tokenizer_backend: huggingface
+export:
+  wandb:
+    entity: ${run.wandb.entity}
+    project: ${run.wandb.project}
diff --git a/src/nemotron/steps/eval/model_eval/config/tiny.yaml b/src/nemotron/steps/eval/model_eval/config/tiny.yaml
deleted file mode 100644
index 51b44c26d..000000000
--- a/src/nemotron/steps/eval/model_eval/config/tiny.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Tiny smoke-test config for NeMo Evaluator
-
-output_dir: ./results-tiny
-
-deployment:
-  model_id: megatron_model
-  url: http://0.0.0.0:8080/v1/completions/
-  endpoint_type: completions
-  api_key_name: NGC_API_KEY
-
-benchmarks:
-  - hellaswag
-
-params:
-  temperature: 0
-  top_p: 0
-  parallelism: 1
-  request_timeout: 3600
-  limit_samples: 20
-  extra:
-    tokenizer: /path/to/checkpoint/tokenizer
-    tokenizer_backend: huggingface
diff --git a/src/nemotron/steps/eval/model_eval/config/tiny_chat.yaml b/src/nemotron/steps/eval/model_eval/config/tiny_chat.yaml
new file mode 100644
index 000000000..91c52b232
--- /dev/null
+++ b/src/nemotron/steps/eval/model_eval/config/tiny_chat.yaml
@@ -0,0 +1,51 @@
+# Tiny hosted chat endpoint smoke-test config.
+#
+# Export endpoint settings before running:
+#   export NEMO_EVALUATOR_MODEL_ID=<exact model id>
+#   export NEMO_EVALUATOR_MODEL_URL=<OpenAI-compatible chat completions endpoint URL>
+#   export NEMO_EVALUATOR_API_KEY_NAME=NVIDIA_API_KEY
+#   export NEMO_EVALUATOR_ENDPOINT_TYPE=chat
+
+dry_run: false
+output_dir: ./results-tiny-chat
+task_filters: null
+
+execution:
+  type: local
+  mode: sequential
+  output_dir: ${output_dir}
+
+deployment:
+  type: none
+
+target:
+  api_endpoint:
+    model_id: ${oc.env:NEMO_EVALUATOR_MODEL_ID,''}
+    url: ${oc.env:NEMO_EVALUATOR_MODEL_URL,''}
+    api_key_name: ${oc.env:NEMO_EVALUATOR_API_KEY_NAME,NVIDIA_API_KEY}
+    type: ${oc.env:NEMO_EVALUATOR_ENDPOINT_TYPE,chat}
+
+evaluation:
+  nemo_evaluator_config:
+    config:
+      params:
+        temperature: 0.0
+        top_p: 1.0
+        max_new_tokens: 1024
+        max_retries: 5
+        parallelism: 1
+        request_timeout: 3600
+        limit_samples: 1
+    target:
+      api_endpoint:
+        adapter_config:
+          output_dir: /results
+          use_progress_tracking: false
+          use_caching: true
+          caching_dir: /results/cache
+          use_response_logging: true
+          max_logged_responses: 5
+          use_request_logging: true
+          max_logged_requests: 5
+  tasks:
+    - name: mmlu_instruct
diff --git a/src/nemotron/steps/eval/model_eval/runtime.py b/src/nemotron/steps/eval/model_eval/runtime.py
new file mode 100644
index 000000000..59871eb8a
--- /dev/null
+++ b/src/nemotron/steps/eval/model_eval/runtime.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Runtime helpers for `eval/model_eval`."""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+from omegaconf import DictConfig, OmegaConf
+
+from nemo_runspec.config import clear_artifact_cache, register_resolvers_from_config
+from nemo_runspec.evaluator import (
+    ensure_wandb_host_env,
+    get_non_task_args,
+    inject_wandb_env_mappings,
+    maybe_auto_squash_evaluator,
+    needs_wandb,
+    parse_task_flags,
+    save_eval_configs,
+)
+from nemotron.kit.train_script import (
+    apply_hydra_overrides,
+    load_omegaconf_yaml,
+    parse_config_and_overrides,
+)
+
+_STEP_ONLY_KEYS = {
+    "dry_run",
+    "output_dir",
+    "task_filters",
+}
+
+
+def run_model_eval(*, default_config: Path) -> None:
+    config_path, cfg, overrides = _load_config(default_config)
+    passthrough = _passthrough_args(overrides)
+    _validate_passthrough(passthrough)
+
+    launcher_cfg, dry_run, configured_tasks = _build_launcher_config(cfg)
+    task_filters = parse_task_flags(passthrough) or configured_tasks
+    eval_path = _save_launcher_config(config_path, cfg, launcher_cfg)
+
+    try:
+        from nemo_evaluator_launcher.api.functional import run_eval
+    except ImportError:
+        print("Error: nemo-evaluator-launcher is required for evaluation", file=sys.stderr)
+        print("Install with: uv sync --extra evaluator", file=sys.stderr)
+        raise SystemExit(1)
+
+    invocation_id = run_eval(launcher_cfg, dry_run=dry_run, tasks=task_filters)
+    print(f"launcher_config: {eval_path}")
+    if invocation_id:
+        print(f"launcher_invocation_id: {invocation_id}")
+        print(f"status_command: nemo-evaluator-launcher status {invocation_id}")
+        print(f"logs_command: nemo-evaluator-launcher logs {invocation_id}")
+
+
+def _load_config(default_config: Path) -> tuple[Path, DictConfig, list[str]]:
+    config_path, overrides = parse_config_and_overrides(default_config=default_config)
+    cfg = apply_hydra_overrides(load_omegaconf_yaml(config_path), overrides)
+    return Path(config_path), cfg, overrides
+
+
+def _build_launcher_config(cfg: DictConfig) -> tuple[DictConfig, bool, list[str] | None]:
+    dry_run = bool(cfg.get("dry_run", False))
+    output_dir = cfg.get("output_dir")
+    task_filters = cfg.get("task_filters")
+
+    _maybe_auto_squash(cfg, dry_run=dry_run)
+
+    if needs_wandb(cfg):
+        ensure_wandb_host_env()
+
+    clear_artifact_cache()
+    register_resolvers_from_config(cfg, artifacts_key="run", mode="pre_init")
+    launcher_dict = dict(OmegaConf.to_container(cfg, resolve=True))
+    launcher_dict.pop("run", None)
+    for key in _STEP_ONLY_KEYS:
+        launcher_dict.pop(key, None)
+
+    if output_dir:
+        launcher_dict.setdefault("execution", {})
+        launcher_dict["execution"].setdefault("output_dir", output_dir)
+
+    launcher_cfg = OmegaConf.create(launcher_dict)
+    if needs_wandb(launcher_cfg):
+        ensure_wandb_host_env()
+        inject_wandb_env_mappings(launcher_cfg)
+
+    return launcher_cfg, dry_run, list(task_filters) if task_filters else None
+
+
+def _passthrough_args(overrides: list[str]) -> list[str]:
+    """Return non-Hydra passthrough args from direct step.py invocation."""
+    return [arg for arg in overrides if arg != "--" and "=" not in arg]
+
+
+def _validate_passthrough(passthrough: list[str]) -> None:
+    extra_args = get_non_task_args(passthrough)
+    if extra_args:
+        print(
+            f"Error: Unknown arguments: {' '.join(extra_args)}\nOnly -t/--task flags are supported for passthrough.",
+            file=sys.stderr,
+        )
+        raise SystemExit(1)
+
+
+def _maybe_auto_squash(cfg: DictConfig, *, dry_run: bool) -> None:
+    run = cfg.get("run")
+    if not isinstance(run, DictConfig):
+        return
+
+    mode = str(run.get("mode", "local"))
+    force_squash = bool(run.get("force_squash", False))
+    maybe_auto_squash_evaluator(
+        cfg,
+        mode=mode,
+        dry_run=dry_run,
+        force_squash=force_squash,
+    )
+
+
+def _save_launcher_config(
+    config_path: Path,
+    cfg: DictConfig,
+    launcher_cfg: DictConfig,
+) -> Path:
+    if config_path.name == "train.yaml":
+        eval_path = config_path.with_name("eval.yaml")
+    else:
+        _, eval_path = save_eval_configs(cfg, "eval/model_eval")
+
+    OmegaConf.save(launcher_cfg, eval_path)
+    return eval_path
diff --git a/src/nemotron/steps/eval/model_eval/step.py b/src/nemotron/steps/eval/model_eval/step.py
index 21fdc66f7..ffd172897 100644
--- a/src/nemotron/steps/eval/model_eval/step.py
+++ b/src/nemotron/steps/eval/model_eval/step.py
@@ -31,55 +31,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Thin NeMo Evaluator wrapper; see `Evaluator/docs/deployment/nemo-fw/_snippets/`."""
+"""Entry point for the generic NeMo Evaluator Launcher step."""
 
 from __future__ import annotations
 
 from pathlib import Path
 
-from nemo_evaluator.api import check_endpoint, evaluate
-from nemo_evaluator.api.api_dataclasses import ApiEndpoint, ConfigParams, EvaluationConfig, EvaluationTarget
-from omegaconf import OmegaConf
-
-from nemotron.kit.train_script import (
-    apply_hydra_overrides,
-    load_omegaconf_yaml,
-    parse_config_and_overrides,
-)
+from nemotron.steps.eval.model_eval.runtime import run_model_eval
 
 DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 
 
 def main() -> None:
-    config_path, overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
-    cfg = OmegaConf.to_container(
-        apply_hydra_overrides(load_omegaconf_yaml(config_path), overrides),
-        resolve=True,
-    )
-
-    endpoint = ApiEndpoint(
-        url=cfg["deployment"]["url"],
-        type=cfg["deployment"].get("endpoint_type", "completions"),
-        model_id=cfg["deployment"]["model_id"],
-        api_key=cfg["deployment"].get("api_key_name"),
-    )
-    target = EvaluationTarget(api_endpoint=endpoint)
-    params = ConfigParams(**cfg.get("params", {}))
-    check_endpoint(
-        endpoint_url=endpoint.url,
-        endpoint_type=endpoint.type,
-        model_name=endpoint.model_id,
-    )
-
-    for benchmark in cfg["benchmarks"]:
-        evaluate(
-            target_cfg=target,
-            eval_cfg=EvaluationConfig(
-                type=benchmark,
-                params=params,
-                output_dir=str(Path(cfg["output_dir"]) / benchmark),
-            ),
-        )
+    run_model_eval(default_config=DEFAULT_CONFIG)
 
 
 if __name__ == "__main__":
diff --git a/src/nemotron/steps/eval/model_eval/step.toml b/src/nemotron/steps/eval/model_eval/step.toml
index 36cb85475..4eca6bfda 100644
--- a/src/nemotron/steps/eval/model_eval/step.toml
+++ b/src/nemotron/steps/eval/model_eval/step.toml
@@ -16,7 +16,7 @@
 id = "eval/model_eval"
 name = "Model Evaluation (NeMo Evaluator)"
 category = "eval"
-description = """Deploy a trained checkpoint behind an OpenAI-compatible endpoint and run benchmark suites with NeMo Evaluator, producing consolidated evaluation results."""
+description = """Deploy a Megatron Bridge checkpoint behind an OpenAI-compatible endpoint, or evaluate an existing hosted endpoint, with NeMo Evaluator Launcher."""
 tags = ["eval", "benchmarks", "nemo-evaluator", "checkpoint", "launcher"]
 
 [[consumes]]
@@ -24,27 +24,62 @@ type = "checkpoint_megatron"
 description = "Megatron Bridge checkpoint directory, usually an iter_* directory, for deployment and evaluation."
 required = false
 
-[[consumes]]
-type = "checkpoint_hf"
-description = "Hugging Face checkpoint or model path for deployment and evaluation."
-required = false
-
 [[produces]]
 type = "eval_results"
 description = "Benchmark metrics, artifacts, and evaluation summaries produced by NeMo Evaluator."
 
 [[parameters]]
-name = "benchmarks"
-description = "Benchmark task names to run, using NeMo Evaluator or launcher task identifiers."
-default = ["lm-evaluation-harness.ifeval", "simple_evals.gpqa_diamond"]
+name = "evaluation.tasks"
+description = "Configured NeMo Evaluator Launcher task entries. Each entry uses an exact task ID from `nemo-evaluator-launcher ls tasks`."
+default = [{ name = "adlr_mmlu" }, { name = "hellaswag" }]
 
-[[strategies]]
-when = "The input artifact is checkpoint_hf"
-then = "Deploy with a Hugging Face-friendly backend such as vLLM or Ray Serve, then point NeMo Evaluator at the resulting chat or completions endpoint."
+[[parameters]]
+name = "target.api_endpoint.url"
+description = "OpenAI-compatible endpoint URL for a hosted model when deployment.type=none. Hosted chat smoke tests use config/tiny_chat.yaml."
+
+[[parameters]]
+name = "target.api_endpoint.model_id"
+description = "Exact hosted endpoint model id. Prefer ids returned by the endpoint's /v1/models API."
+
+[[parameters]]
+name = "target.api_endpoint.type"
+description = "Endpoint type expected by the evaluator task. Use chat for hosted tiny_chat smoke tests; keep the default (completions) for checkpoint tasks deployed through config/default.yaml."
+default = "completions"
+
+[[parameters]]
+name = "deployment.checkpoint_path"
+description = "Megatron Bridge checkpoint path passed to NeMo Evaluator Launcher's deployment config. Defaults to ${art:model,path}; override with an explicit iter_* path."
+
+[[parameters]]
+name = "deployment.image"
+description = "Container image used by NeMo Evaluator Launcher to host the checkpoint."
+
+[[parameters]]
+name = "evaluation.nemo_evaluator_config.config.params.extra.tokenizer"
+description = "Tokenizer path or Hugging Face id required by loglikelihood/logprob tasks such as HellaSwag. Use the evaluated model's tokenizer for real scores."
+
+[[parameters]]
+name = "target.api_endpoint.api_key_name"
+description = "Environment variable name that stores the endpoint key. Never put the raw key in config."
+default = "NVIDIA_API_KEY"
+
+[[parameters]]
+name = "evaluation.nemo_evaluator_config.config.params.limit_samples"
+description = "Optional per-task sample limit. The default Megatron checkpoint config leaves this null to run the full task; config/tiny_chat.yaml sets it to 1 for hosted smoke testing."
+
+[[parameters]]
+name = "task_filters"
+description = "Optional task names passed to NeMo Evaluator Launcher to run a subset of configured tasks."
+default = []
+
+[[parameters]]
+name = "dry_run"
+description = "Pass dry_run=true to NeMo Evaluator Launcher. This is different from nemotron steps run --dry-run, which only compiles the Nemotron job config."
+default = false
 
 [[strategies]]
 when = "The input artifact is checkpoint_megatron"
-then = "Deploy the iter_* Megatron Bridge checkpoint with the Megatron-serving path and use the checkpoint tokenizer for log-probability benchmarks."
+then = "Use config/default.yaml and point deployment.checkpoint_path at the concrete iter_* Megatron Bridge checkpoint."
 
 [[strategies]]
 when = "The benchmark suite is chat or instruction-following oriented"
@@ -52,15 +87,25 @@ then = "Use a chat endpoint and start from deterministic generation settings unl
 
 [[strategies]]
 when = "The benchmark suite is log-probability based, such as arc_challenge, hellaswag, or piqa"
-then = "Use a completions endpoint with logprobs support and provide a matching tokenizer path or model handle."
+then = "Verify the endpoint supports the capability the selected task requires and provide the evaluated model's tokenizer. Some tasks require completions/logprobs rather than chat."
 
 [[strategies]]
 when = "You are evaluating a reasoning model"
 then = "Increase max_new_tokens, enable reasoning-trace processing, and use the model-card temperature and top-p defaults for reasoning tasks."
 
+[[strategies]]
+when = "The user wants a quick hosted endpoint validation"
+then = "Use config/tiny_chat.yaml with a chat-compatible hosted endpoint; keep limit_samples=1 for smoke tests."
+skill = "src/nemotron/steps/eval/model_eval/SKILL.md"
+
+[[strategies]]
+when = "You are selecting benchmark names"
+then = "Use exact task IDs from `nemo-evaluator-launcher ls tasks` or `nemo-evaluator-launcher ls task <task_id>`; do not prepend harness names unless that exact dotted ID is listed."
+skill = "src/nemotron/steps/eval/model_eval/SKILL.md"
+
 [[errors]]
 name = "missing_tokenizer_for_logprobs"
-recovery = "Provide tokenizer and tokenizer_backend for log-probability tasks, using checkpoint/tokenizer for Megatron Bridge or the HF tokenizer handle/path for checkpoint_hf."
+recovery = "Use a task/endpoint pairing that does not require logprobs, or provide a deployment endpoint and tokenizer supported by the selected Launcher task."
 
 [[errors]]
 name = "wrong_endpoint_type"
@@ -70,6 +115,11 @@ recovery = "Use chat endpoints for instruction/chat benchmarks and completions e
 name = "bad_megatron_checkpoint_path"
 recovery = "Point evaluation deployment at the specific iter_* checkpoint directory rather than only the parent output folder."
 
+[[errors]]
+name = "framework_cli_not_found"
+recovery = "Install the evaluator extra with uv sync --extra evaluator and run through the same environment so Launcher task CLIs are on PATH."
+
 [reference]
+skill = "src/nemotron/steps/eval/model_eval/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples/local_basic.yaml"
 docs = "https://docs.nvidia.com/nemo/evaluator/latest/get-started/quickstart/launcher.html"
diff --git a/src/nemotron/steps/index.py b/src/nemotron/steps/index.py
index 5473609c2..426f4f32a 100644
--- a/src/nemotron/steps/index.py
+++ b/src/nemotron/steps/index.py
@@ -25,19 +25,18 @@
 DEFAULT_STEPS_ROOT = Path(__file__).resolve().parent
 DEFAULT_PATTERNS_DIR = DEFAULT_STEPS_ROOT / "patterns"
 CATEGORY_TITLES = {
-    "benchmark": "Benchmarking",
     "byob": "Bring Your Own Benchmark",
     "convert": "Conversion",
     "curate": "Data Curation",
+    "data_prep": "Data Preparation",
+    "env": "Environment Profiles",
     "eval": "Evaluation",
     "optimize": "Model Optimization",
     "peft": "Parameter-Efficient Fine-Tuning",
-    "prep": "Data Preparation",
     "pretrain": "Pretraining",
     "rl": "Reinforcement Learning",
     "sft": "Supervised Fine-Tuning",
     "sdg": "Synthetic Data Generation",
-    "synth": "Synthetic Data Generation",
     "translate": "Translation",
 }
 VALID_PATTERN_CONFIDENCE = {"high", "medium", "experimental"}
diff --git a/src/nemotron/steps/optimize/SKILL.md b/src/nemotron/steps/optimize/SKILL.md
index 40352dbf6..7119d3006 100644
--- a/src/nemotron/steps/optimize/SKILL.md
+++ b/src/nemotron/steps/optimize/SKILL.md
@@ -36,7 +36,7 @@ three together.
 ```
 sft/automodel  → optimize/modelopt/quantize → eval/model_eval        # smaller serving footprint
 sft/automodel  → optimize/modelopt/prune    → optimize/modelopt/distill → eval/model_eval   # smaller architecture + quality recovery
-prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval     # standalone distillation
+data_prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval     # standalone distillation
 ```
 
 ## Pre-conditions
@@ -54,13 +54,14 @@ prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval     # stand
 
 ## Workflow
 
-1. **Env profile first** — verify the env profile for Lepton/Slurm/Ray runs
-   (`env.toml` by default, or `NEMOTRON_ENV_FILE` for backend-specific files).
-2. Decide deployment hardware, serving stack, checkpoint format, and quality
+1. Decide deployment hardware, serving stack, checkpoint format, and quality
    budget **before** picking the step.
-3. Read the target step's `step.toml` and `config/default.yaml`.
-4. Smoke with `config/tiny.yaml` (quantize/prune) or
+2. Read the target step's `step.toml` and `config/default.yaml`.
+3. Smoke with `config/tiny.yaml` (quantize/prune) or
    `args.use_mock_data=true` (distill) — these prove the wrapper, not quality.
+4. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 5. Run the full job on representative calibration / distillation data.
 6. Convert the output if the next stage expects a different checkpoint format
    (`convert/megatron_to_hf` after quantize/distill if HF is needed).
@@ -69,9 +70,9 @@ prep/pretrain_prep → optimize/modelopt/distill → eval/model_eval     # stand
 ## Smoke commands
 
 ```bash
-nemotron step run optimize/modelopt/quantize -c tiny
-nemotron step run optimize/modelopt/prune -c tiny
-nemotron step run optimize/modelopt/distill -c tiny    # uses use_mock_data=true
+uv run nemotron steps run optimize/modelopt/quantize -c tiny --dry-run
+uv run nemotron steps run optimize/modelopt/prune -c tiny --dry-run
+uv run nemotron steps run optimize/modelopt/distill -c tiny --dry-run    # uses use_mock_data=true
 ```
 
 ## Guardrails
diff --git a/src/nemotron/steps/optimize/modelopt/distill/SKILL.md b/src/nemotron/steps/optimize/modelopt/distill/SKILL.md
index c12765238..4af7be2c3 100644
--- a/src/nemotron/steps/optimize/modelopt/distill/SKILL.md
+++ b/src/nemotron/steps/optimize/modelopt/distill/SKILL.md
@@ -12,7 +12,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 ## Inputs And Outputs
 
 - Consume teacher and student `checkpoint_hf` paths.
-- Optionally consume `binidx` data from `prep/pretrain_prep`.
+- Optionally consume `binidx` data from `data_prep/pretrain_prep`.
 - Produce `checkpoint_megatron`.
 - Validate launch and checkpoint writing before using real distillation data for quality evaluation.
 
@@ -20,6 +20,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Set `args.teacher_hf_path` and `args.student_hf_path`.
 - Set `args.data_paths` for real Megatron bin/idx distillation data.
+- Set `args.output_dir` away from teacher and student checkpoint roots.
 - Use `args.use_mock_data=true` only for launch validation.
 - Set `args.hf_export_path` when a Hugging Face export is needed directly.
 - Use `extra_args` for newly exposed upstream flags.
@@ -43,4 +44,5 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Use the original BF16 model as teacher when recovering from pruning or quantization.
 - Do not treat mock-data runs as quality validation.
+- Keep teacher, student, tokenizer, and distillation-data assumptions explicit.
 - Choose distillation data that matches the deployment domain.
diff --git a/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml b/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml
index f72f4c718..cef3979aa 100644
--- a/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml
+++ b/src/nemotron/steps/optimize/modelopt/distill/config/default.yaml
@@ -18,11 +18,11 @@
 # - distill.py teaches a student model from a larger/stronger teacher.
 # - It can run standalone or recover accuracy after pruning / quantization.
 # - Real-data runs expect pre-tokenized Megatron bin/idx data. Use
-#   prep/pretrain_prep first and pass the produced data prefixes via data_paths.
+#   data_prep/pretrain_prep first and pass the produced data prefixes via data_paths.
 #
 # Usage:
-#   nemotron step run optimize/modelopt/distill -c default
-#   nemotron step run optimize/modelopt/distill -c default args.use_mock_data=true args.train_iters=100
+#   nemotron steps run optimize/modelopt/distill -c default
+#   nemotron steps run optimize/modelopt/distill -c default args.use_mock_data=true args.train_iters=100
 #
 # Script arguments are forwarded from args: using underscore CLI flags. For
 # example, args.teacher_hf_path becomes --teacher_hf_path. New upstream flags can
diff --git a/src/nemotron/steps/optimize/modelopt/distill/step.toml b/src/nemotron/steps/optimize/modelopt/distill/step.toml
index 98f1ab760..f64a0b5a8 100644
--- a/src/nemotron/steps/optimize/modelopt/distill/step.toml
+++ b/src/nemotron/steps/optimize/modelopt/distill/step.toml
@@ -29,7 +29,7 @@ required = true
 
 [[consumes]]
 type = "binidx"
-description = "Optional real distillation data from prep/pretrain_prep. Not needed when use_mock_data=true."
+description = "Optional real distillation data from data_prep/pretrain_prep. Not needed when use_mock_data=true."
 required = false
 
 [[produces]]
@@ -48,11 +48,19 @@ description = "Student HF checkpoint path or model id."
 name = "args.data_paths"
 description = "Megatron data blend CLI sequence: [weight, prefix, ...]."
 
+[[parameters]]
+name = "args.output_dir"
+description = "Output directory for distilled Megatron checkpoints. Keep it distinct from teacher and student roots."
+
 [[parameters]]
 name = "args.use_mock_data"
 description = "Run a smoke test without real bin/idx data."
 default = false
 
+[[parameters]]
+name = "args.hf_export_path"
+description = "Optional HF export path when a standalone Hugging Face checkpoint is needed directly from distillation."
+
 [[parameters]]
 name = "extra_args"
 description = "Literal upstream args for newly added ModelOpt distillation flags."
@@ -70,7 +78,28 @@ then = "Set args.use_mock_data=true, args.seq_length=512, args.train_iters=100,
 when = "Need a HuggingFace checkpoint"
 then = "Set args.hf_export_path and args.student_hf_model, or convert a saved Megatron iteration later."
 
+[[strategies]]
+when = "Distilling after pruning"
+then = "Set args.student_hf_path to the pruned HF output and args.teacher_hf_path to the original full-precision checkpoint."
+
+[[errors]]
+name = "mock_data_used_as_quality_signal"
+recovery = "Use args.use_mock_data=true only for launch validation. Switch to real args.data_paths before judging quality."
+
+[[errors]]
+name = "teacher_student_or_tokenizer_mismatch"
+recovery = "Keep teacher, student, tokenizer, and distillation data assumptions explicit, especially after structural pruning."
+
+[[errors]]
+name = "output_dir_overlaps_checkpoint_roots"
+recovery = "Set args.output_dir away from args.teacher_hf_path and args.student_hf_path to avoid mixing inputs and outputs."
+
+[[errors]]
+name = "wandb_not_inherited"
+recovery = "Use the upstream distill W&B flags and pass project/entity/name through the run environment instead of wrapping the process."
+
 [reference]
+skill = "src/nemotron/steps/optimize/modelopt/distill/SKILL.md"
 modelopt_repo = "https://github.com/NVIDIA/Model-Optimizer"
 modelopt_readme = "https://github.com/NVIDIA/Model-Optimizer/blob/main/README.md"
 modelopt_docs = "https://nvidia.github.io/Model-Optimizer/"
@@ -78,4 +107,4 @@ modelopt_megatron_bridge_readme = "https://github.com/NVIDIA/Model-Optimizer/blo
 megatron_bridge_repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 megatron_bridge_distillation_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/distillation.html"
-skills = ["skills/nemotron-customize/context/modelopt-optimization.txt"]
+skills = ["skills/nemotron-customize/references/context/modelopt-optimization.txt"]
diff --git a/src/nemotron/steps/optimize/modelopt/prune/SKILL.md b/src/nemotron/steps/optimize/modelopt/prune/SKILL.md
index fc2510317..9f88506f6 100644
--- a/src/nemotron/steps/optimize/modelopt/prune/SKILL.md
+++ b/src/nemotron/steps/optimize/modelopt/prune/SKILL.md
@@ -13,10 +13,12 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Consume `checkpoint_hf`.
 - Produce pruned `checkpoint_hf`.
-- Smoke with `nemotron step run optimize/modelopt/prune -c tiny`.
+- Smoke with `nemotron steps run optimize/modelopt/prune -c tiny`.
 
 ## Configure
 
+- Set `args.hf_model_name_or_path` to a clean HF checkpoint and
+  `args.output_hf_path` to a fresh output directory.
 - Set `args.prune_target_params` when ModelOpt should search for a target budget.
 - Set `args.prune_export_config` when the deployment architecture is fixed.
 - Set `args.prune_target_params=null` when using a fixed export config.
@@ -42,4 +44,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Check pipeline-parallel and attention-head divisibility after pruning layer counts or hidden dimensions.
 - Distill the pruned model when quality recovery matters.
+- Keep the ModelOpt checkout and installed package in sync before debugging
+  wrapper-level changes.
 - Validate export and downstream loading before claiming the pruned artifact is usable.
diff --git a/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml b/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml
index be04398a1..4c2d1dae2 100644
--- a/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml
+++ b/src/nemotron/steps/optimize/modelopt/prune/config/default.yaml
@@ -25,8 +25,8 @@
 #   extra_args when using your own data.
 #
 # Usage:
-#   nemotron step run optimize/modelopt/prune -c default
-#   nemotron step run optimize/modelopt/prune -c default args.prune_target_params=6e9
+#   nemotron steps run optimize/modelopt/prune -c default
+#   nemotron steps run optimize/modelopt/prune -c default args.prune_target_params=6e9
 #
 # Script arguments are forwarded from args: using underscore CLI flags. For
 # example, args.prune_target_params becomes --prune_target_params. New upstream
diff --git a/src/nemotron/steps/optimize/modelopt/prune/step.toml b/src/nemotron/steps/optimize/modelopt/prune/step.toml
index c0bdc0f6e..172294ec4 100644
--- a/src/nemotron/steps/optimize/modelopt/prune/step.toml
+++ b/src/nemotron/steps/optimize/modelopt/prune/step.toml
@@ -31,6 +31,14 @@ required = true
 type = "checkpoint_hf"
 description = "Pruned HuggingFace checkpoint."
 
+[[parameters]]
+name = "args.hf_model_name_or_path"
+description = "HF model id or local HF checkpoint path to prune. Use a clean merged checkpoint."
+
+[[parameters]]
+name = "args.output_hf_path"
+description = "Output HF checkpoint path for the pruned model. Keep it outside the input checkpoint directory."
+
 [[parameters]]
 name = "args.prune_target_params"
 description = "Target parameter count for search, e.g. 6e9."
@@ -44,6 +52,10 @@ description = "Manual architecture dict such as hidden_size / ffn_hidden_size /
 name = "args.hparams_to_skip"
 description = "Architecture hparams to leave untouched, e.g. num_attention_heads."
 
+[[parameters]]
+name = "args.pp_size"
+description = "Pipeline parallel size used by the pruning script; layer counts and uneven-stage overrides must be compatible."
+
 [[parameters]]
 name = "extra_args"
 description = "Literal upstream args for newly added ModelOpt pruning flags."
@@ -61,7 +73,28 @@ then = "Set args.prune_export_config and set args.prune_target_params=null."
 when = "Layer count does not divide PP size"
 then = "Use args.num_layers_in_first_pipeline_stage / args.num_layers_in_last_pipeline_stage for uneven PP."
 
+[[strategies]]
+when = "Quality recovery matters"
+then = "Run optimize/modelopt/distill after pruning with the original BF16 checkpoint as teacher and the pruned HF output as student."
+
+[[errors]]
+name = "target_and_export_config_both_set"
+recovery = "Use args.prune_target_params for search, or set it to null when args.prune_export_config defines the exact target architecture."
+
+[[errors]]
+name = "output_path_overlaps_input"
+recovery = "Set args.output_hf_path outside args.hf_model_name_or_path because pruning writes intermediate state near the output."
+
+[[errors]]
+name = "modelopt_package_drift"
+recovery = "Keep the ModelOpt checkout and installed package in sync when errors mention missing utilities or mismatched scoring signatures."
+
+[[errors]]
+name = "invalid_parallel_divisibility"
+recovery = "Check attention heads, query groups, Mamba groups, and layer counts against pp/tp settings after pruning."
+
 [reference]
+skill = "src/nemotron/steps/optimize/modelopt/prune/SKILL.md"
 modelopt_repo = "https://github.com/NVIDIA/Model-Optimizer"
 modelopt_readme = "https://github.com/NVIDIA/Model-Optimizer/blob/main/README.md"
 modelopt_docs = "https://nvidia.github.io/Model-Optimizer/"
@@ -69,4 +102,4 @@ modelopt_megatron_bridge_readme = "https://github.com/NVIDIA/Model-Optimizer/blo
 megatron_bridge_repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 megatron_bridge_pruning_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/pruning.html"
-skills = ["skills/nemotron-customize/context/modelopt-optimization.txt"]
+skills = ["skills/nemotron-customize/references/context/modelopt-optimization.txt"]
diff --git a/src/nemotron/steps/optimize/modelopt/quantize/SKILL.md b/src/nemotron/steps/optimize/modelopt/quantize/SKILL.md
index b8ca78869..d7c274e11 100644
--- a/src/nemotron/steps/optimize/modelopt/quantize/SKILL.md
+++ b/src/nemotron/steps/optimize/modelopt/quantize/SKILL.md
@@ -17,10 +17,13 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Configure
 
+- Set `args.hf_model_id` to the HF checkpoint to quantize; merge LoRA inputs
+  before this step.
 - Use `config/fp8.yaml` for Hopper or H100 targets when FP8 is the intended serving format.
 - Use `config/nvfp4.yaml` for Blackwell or B200 targets when NVFP4 is supported by the model and serving stack.
 - Set `args.export_quant_cfg` to a value accepted by the installed upstream script: `int8_sq`, `fp8`, `fp8_blockwise`, `int4_awq`, `w4a8_awq`, or `nvfp4`.
 - Set `args.calib_size` high enough for representative activation ranges.
+- Keep `args.megatron_save_path` outside the source checkpoint path.
 - Use `extra_args` for new upstream ModelOpt or Megatron-Bridge flags.
 
 ## Config Nuances
diff --git a/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml b/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml
index 991f3ceed..2fd5e05d7 100644
--- a/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml
+++ b/src/nemotron/steps/optimize/modelopt/quantize/config/default.yaml
@@ -22,8 +22,8 @@
 #   Megatron-Bridge quantization scripts.
 #
 # Usage:
-#   nemotron step run optimize/modelopt/quantize -c default
-#   nemotron step run optimize/modelopt/quantize -c default args.export_quant_cfg=fp8
+#   nemotron steps run optimize/modelopt/quantize -c default
+#   nemotron steps run optimize/modelopt/quantize -c default args.export_quant_cfg=fp8
 #
 # Script arguments are forwarded from args: using hyphenated CLI flags. For
 # example, args.hf_model_id becomes --hf-model-id. New upstream flags can be
diff --git a/src/nemotron/steps/optimize/modelopt/quantize/step.toml b/src/nemotron/steps/optimize/modelopt/quantize/step.toml
index 50304d07d..b4e3075ae 100644
--- a/src/nemotron/steps/optimize/modelopt/quantize/step.toml
+++ b/src/nemotron/steps/optimize/modelopt/quantize/step.toml
@@ -31,6 +31,10 @@ required = true
 type = "checkpoint_megatron"
 description = "Quantized Megatron distributed checkpoint."
 
+[[parameters]]
+name = "args.hf_model_id"
+description = "HF model id or local HF checkpoint path to quantize. Use a clean, merged checkpoint rather than an adapter."
+
 [[parameters]]
 name = "args.export_quant_cfg"
 description = "ModelOpt/Megatron-Bridge quantization recipe: int8_sq, fp8, fp8_blockwise, int4_awq, w4a8_awq, or nvfp4."
@@ -41,6 +45,18 @@ name = "args.calib_size"
 description = "Number of calibration samples."
 default = 512
 
+[[parameters]]
+name = "args.megatron_save_path"
+description = "Output Megatron checkpoint path. Keep it outside the source checkpoint directory."
+
+[[parameters]]
+name = "args.tp"
+description = "Tensor parallel size. torchrun world size must be divisible by tp * pp and compatible with MoE/Mamba group counts."
+
+[[parameters]]
+name = "args.pp"
+description = "Pipeline parallel size. Use with args.tp to keep torchrun world size and model divisibility valid."
+
 [[parameters]]
 name = "extra_args"
 description = "Literal upstream args for newly added ModelOpt/Megatron-Bridge flags."
@@ -58,7 +74,28 @@ then = "Start with config/nvfp4.yaml and args.export_quant_cfg=nvfp4."
 when = "Need a HuggingFace checkpoint"
 then = "Export the produced Megatron checkpoint with /opt/Megatron-Bridge/examples/quantization/export.py."
 
+[[strategies]]
+when = "Calibrating for quality rather than launch validation"
+then = "Use representative prompts and a realistic args.calib_size; tiny configs only prove that the wrapper starts."
+
+[[errors]]
+name = "unsupported_quant_recipe"
+recovery = "Use a recipe accepted by the installed upstream quantize.py (check its --help output), such as int8_sq, fp8, fp8_blockwise, int4_awq, w4a8_awq, or nvfp4."
+
+[[errors]]
+name = "world_size_not_divisible"
+recovery = "Adjust torchrun.nproc_per_node/nnodes or args.tp/args.pp so the torchrun world size (nproc_per_node * nnodes) is divisible by args.tp * args.pp."
+
+[[errors]]
+name = "source_checkpoint_overwritten"
+recovery = "Keep args.megatron_save_path separate from args.hf_model_id so failed exports do not corrupt the source model."
+
+[[errors]]
+name = "tiny_calibration_used_as_quality_signal"
+recovery = "Treat tiny calibration runs as launch validation only. Re-run with representative calibration data before evaluating quality."
+
 [reference]
+skill = "src/nemotron/steps/optimize/modelopt/quantize/SKILL.md"
 nemotron_quantization = "https://github.com/NVIDIA-NeMo/Nemotron/blob/main/docs/nemotron/super3/quantization.md"
 modelopt_repo = "https://github.com/NVIDIA/Model-Optimizer"
 modelopt_readme = "https://github.com/NVIDIA/Model-Optimizer/blob/main/README.md"
@@ -67,4 +104,4 @@ modelopt_megatron_bridge_readme = "https://github.com/NVIDIA/Model-Optimizer/blo
 megatron_bridge_repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 megatron_bridge_quantization_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/modelopt/quantization.html"
-skills = ["skills/nemotron-customize/context/modelopt-optimization.txt"]
+skills = ["skills/nemotron-customize/references/context/modelopt-optimization.txt"]
diff --git a/src/nemotron/steps/patterns/cpt-data-blend-scoping.md b/src/nemotron/steps/patterns/cpt-data-blend-scoping.md
index f337ed4c0..98a624561 100644
--- a/src/nemotron/steps/patterns/cpt-data-blend-scoping.md
+++ b/src/nemotron/steps/patterns/cpt-data-blend-scoping.md
@@ -7,7 +7,7 @@ triggers:
   - "A CPT blend mixes domain text (legal, medical, government, finance) with general web/Wikipedia data."
   - "The training token budget is being chosen for a continued-pretraining run."
   - "Catastrophic forgetting on general benchmarks is a concern after CPT."
-steps: [prep/pretrain_prep, pretrain/automodel, pretrain/megatron_bridge]
+steps: [data_prep/pretrain_prep, pretrain/automodel, pretrain/megatron_bridge]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/custom-mcq-benchmark-byob.md b/src/nemotron/steps/patterns/custom-mcq-benchmark-byob.md
index 95dd083e9..45ff22e31 100644
--- a/src/nemotron/steps/patterns/custom-mcq-benchmark-byob.md
+++ b/src/nemotron/steps/patterns/custom-mcq-benchmark-byob.md
@@ -6,8 +6,8 @@ triggers:
   - You need to create a benchmark from private or domain-specific documents.
   - The benchmark should use generated multiple-choice questions.
   - You need to translate a generated benchmark while preserving answer indexes.
-steps: [byob]
+steps: [byob/mcq]
 confidence: high
 ---
 
-Use `byob` when the request is to generate or translate benchmark artifacts, not training data.
+Use `byob/mcq` when the request is to generate or translate benchmark artifacts, not training data.
diff --git a/src/nemotron/steps/patterns/enable-faith-for-high-value-data.md b/src/nemotron/steps/patterns/enable-faith-for-high-value-data.md
index 528c90c18..47bb4f250 100644
--- a/src/nemotron/steps/patterns/enable-faith-for-high-value-data.md
+++ b/src/nemotron/steps/patterns/enable-faith-for-high-value-data.md
@@ -6,7 +6,7 @@ triggers:
   - "Translated data will be used for governance, audit, or high-value model training."
   - "The user needs quality scores or threshold filtering for translated corpus rows."
   - "Translation quality must gate SFT, CPT, or customer-facing training data."
-steps: [translate/translation]
+steps: [translate/nemo_curator]
 confidence: high
 ---
 
@@ -16,7 +16,7 @@ Use this when translated data quality needs evidence, not just output files. FAI
 
 ## What to do
 
-Enable `faith_eval.enabled=true`. Prefer `faith_eval.segment_level=true` for long corpora because scoring follows the same segment boundaries used by translation.
+Enable `faith_eval.enabled=true`. FAITH scoring follows the translated segments produced by the translation stage, then merges scores back onto the output records.
 
 Ask whether low-scoring rows should be filtered or only annotated. Set `faith_eval.filter_enabled` accordingly.
 
diff --git a/src/nemotron/steps/patterns/multilingual-tokenizer-check.md b/src/nemotron/steps/patterns/multilingual-tokenizer-check.md
index 2a4b8bc17..0888c1c2f 100644
--- a/src/nemotron/steps/patterns/multilingual-tokenizer-check.md
+++ b/src/nemotron/steps/patterns/multilingual-tokenizer-check.md
@@ -6,7 +6,7 @@ triggers:
   - "Training data includes non-English text or mixed-language prompts."
   - "You are adapting a mostly English base model to another language."
   - "The target language uses scripts, spacing rules, or morphology unlike English."
-steps: [prep/sft_packing, sft/megatron_bridge, sft/automodel]
+steps: [data_prep/sft_packing, sft/megatron_bridge, sft/automodel]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/pack-variable-length.md b/src/nemotron/steps/patterns/pack-variable-length.md
index faa9ab66a..a57deb834 100644
--- a/src/nemotron/steps/patterns/pack-variable-length.md
+++ b/src/nemotron/steps/patterns/pack-variable-length.md
@@ -1,12 +1,12 @@
 ---
 id: pack-variable-length
 title: "Pack variable-length SFT data"
-tags: [prep, sft, efficiency]
+tags: [data_prep, sft, efficiency]
 triggers:
   - "Training examples range from very short to very long sequences."
   - "GPU utilization is poor because padding dominates batches."
   - "You are preparing data for Megatron-Bridge SFT with packed inputs available."
-steps: [prep/sft_packing]
+steps: [data_prep/sft_packing]
 confidence: high
 ---
 
@@ -44,6 +44,6 @@ If the downstream trainer or evaluation path cannot respect boundaries or masks
 
 ## References
 
-- Most directly relevant to `prep/sft_packing` and downstream Megatron-Bridge SFT.
+- Most directly relevant to `data_prep/sft_packing` and downstream Megatron-Bridge SFT.
 - Sequence packing is often the highest-leverage efficiency improvement for heterogeneous chat corpora.
 - Revisit this pattern whenever the data mix changes substantially across customers or domains.
diff --git a/src/nemotron/steps/patterns/prefer-llm-for-structured-chat.md b/src/nemotron/steps/patterns/prefer-llm-for-structured-chat.md
index 78a5ce13f..7ccf25037 100644
--- a/src/nemotron/steps/patterns/prefer-llm-for-structured-chat.md
+++ b/src/nemotron/steps/patterns/prefer-llm-for-structured-chat.md
@@ -6,7 +6,7 @@ triggers:
   - "The input is OpenAI-style chat data, tool-calling transcripts, or nested message records."
   - "The translated output must preserve JSON, code blocks, markup, or message structure."
   - "The user wants to translate messages.*.content or another wildcard field path."
-steps: [translate/translation]
+steps: [translate/nemo_curator]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/prefer-nmt-for-large-corpora.md b/src/nemotron/steps/patterns/prefer-nmt-for-large-corpora.md
index 8b3268c20..021ba86a9 100644
--- a/src/nemotron/steps/patterns/prefer-nmt-for-large-corpora.md
+++ b/src/nemotron/steps/patterns/prefer-nmt-for-large-corpora.md
@@ -6,7 +6,7 @@ triggers:
   - "The corpus is large, mostly plain text, and a local NMT service is available."
   - "Translation throughput or cost matters more than nuanced instruction following."
   - "The user mentions an IndicTrans, NMT, or local translation server."
-steps: [translate/translation]
+steps: [translate/nemo_curator]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md b/src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md
index 0a80444d6..6e35b9425 100644
--- a/src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md
+++ b/src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md
@@ -1,12 +1,12 @@
 ---
 id: prep-data-is-tokenizer-locked
 title: "Treat prepared data as tokenizer-locked"
-tags: [prep, tokenizer, data-artifacts]
+tags: [data_prep, tokenizer, data-artifacts]
 triggers:
   - "You are reusing packed Parquet or bin/idx data after changing the tokenizer, chat template, or sequence length."
   - "A downstream trainer reports shape, vocabulary, EOS, loss-mask, or data-prefix mismatches."
   - "You need to decide whether an existing prepared dataset is still compatible with a new training config."
-steps: [prep/sft_packing, prep/pretrain_prep, sft/megatron_bridge, peft/megatron_bridge, pretrain/automodel, pretrain/megatron_bridge]
+steps: [data_prep/sft_packing, data_prep/pretrain_prep, sft/megatron_bridge, peft/megatron_bridge, pretrain/automodel, pretrain/megatron_bridge]
 confidence: high
 ---
 
@@ -14,7 +14,7 @@ confidence: high
 
 Apply this whenever a training pipeline consumes materialized data artifacts rather than raw JSONL or text. Packed Parquet and bin/idx outputs are not generic datasets; they encode tokenizer, template, length, split, and sometimes loss-mask assumptions.
 
-This matters most at handoff points: `prep/sft_packing` into Megatron-Bridge SFT or PEFT, and `prep/pretrain_prep` into AutoModel or Megatron-Bridge pretraining.
+This matters most at handoff points: `data_prep/sft_packing` into Megatron-Bridge SFT or PEFT, and `data_prep/pretrain_prep` into AutoModel or Megatron-Bridge pretraining.
 
 Use it when a run changes model family, tokenizer path, chat template, sequence length, EOS handling, or role formatting. Any of those changes can invalidate data that otherwise still looks readable on disk.
 
@@ -42,6 +42,6 @@ If the downstream trainer reads raw JSONL directly, as AutoModel SFT and PEFT do
 - Pair with `multilingual-tokenizer-check` for non-English / mixed-language data — tokenizer choice affects pack_size and seq_length feasibility.
 - Pair with `sft-data-blending` — the prepared artifact captures the blend ratios; reshuffling means repacking.
 - Pair with `cpt-data-blend-scoping` — bin/idx blends must come from the same Nemotron release as the trainer.
-- Pair with `sdg-pipeline-versioning` when synthetic data feeds the prep step.
+- Pair with `sdg-pipeline-versioning` when synthetic data feeds the data_prep step.
 - Pair with `convert-checkpoint-safety` when a converter (e.g. `convert/megatron_to_hf`) sits between prep and the consumer.
 - This pattern explains many late training failures that originate in prep, not the trainer.
diff --git a/src/nemotron/steps/patterns/pretrain-token-budget-before-scale.md b/src/nemotron/steps/patterns/pretrain-token-budget-before-scale.md
index b7164a38b..9df1a1b1e 100644
--- a/src/nemotron/steps/patterns/pretrain-token-budget-before-scale.md
+++ b/src/nemotron/steps/patterns/pretrain-token-budget-before-scale.md
@@ -7,7 +7,7 @@ triggers:
   - "A pretrain config is being scaled from tiny/local execution to multi-GPU or multi-node execution."
   - "You need to choose between pretrain/automodel and pretrain/megatron_bridge."
   - "Cluster cost or wall-clock budget is being requested for a pretrain run."
-steps: [prep/pretrain_prep, pretrain/automodel, pretrain/megatron_bridge]
+steps: [data_prep/pretrain_prep, pretrain/automodel, pretrain/megatron_bridge]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md b/src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md
index 82d0842b3..ac8baeb2e 100644
--- a/src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md
+++ b/src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md
@@ -6,7 +6,7 @@ triggers:
   - "An RLVR reward function, NeMo-Gym resource server, or learned reward model is being added."
   - "Reward is improving but held-out examples or human review look worse."
   - "A DPO, RLVR, or RLHF run is moving from tiny validation to production rollout counts."
-steps: [prep/rl_prep, rl/nemo_rl/dpo, rl/nemo_rl/rlvr, rl/nemo_rl/rlhf]
+steps: [data_prep/rl_prep, rl/nemo_rl/dpo, rl/nemo_rl/rlvr, rl/nemo_rl/rlhf]
 confidence: high
 ---
 
@@ -41,7 +41,7 @@ Some exploratory reward models are intentionally noisy. In that case, document t
 ## References
 
 - Pair with `eval-before-and-after-training` for pre/post alignment comparisons against task evals (not just reward).
-- Pair with `prep-data-is-tokenizer-locked` when RL data is sharded or materialized through `prep/rl_prep`.
+- Pair with `prep-data-is-tokenizer-locked` when RL data is sharded or materialized through `data_prep/rl_prep`.
 - Pair with `byob-benchmark-design` — RL alignment must be scored against a held-out benchmark the reward function never saw.
 - Pair with `sdg-pipeline-versioning` when synthetic preferences (Data Designer `rl_pref.yaml`) feed DPO.
 - Pair with `data-quality-before-quantity` — bad reward sources scale failure faster than good rewards scale success.
diff --git a/src/nemotron/steps/patterns/sdg-pipeline-versioning.md b/src/nemotron/steps/patterns/sdg-pipeline-versioning.md
index 1594e1d54..25415e720 100644
--- a/src/nemotron/steps/patterns/sdg-pipeline-versioning.md
+++ b/src/nemotron/steps/patterns/sdg-pipeline-versioning.md
@@ -7,7 +7,7 @@ triggers:
   - "A Data Designer config is moving from preview mode to a production-scale generation job."
   - "Generated data will feed SFT, DPO, RLVR, RLHF, or downstream data prep."
   - "A second SDG run needs to reproduce or extend an earlier corpus."
-steps: [sdg/data_designer, prep/sft_packing, prep/rl_prep, sft/automodel, sft/megatron_bridge, rl/nemo_rl/dpo]
+steps: [sdg/data_designer, data_prep/sft_packing, data_prep/rl_prep, sft/automodel, sft/megatron_bridge, rl/nemo_rl/dpo]
 confidence: high
 ---
 
@@ -33,7 +33,7 @@ A single git commit or single tagged directory is the cheapest way; a manifest f
 
 **Always preview before scaling.** Run with `--preview` (or `tiny.yaml`) until the projection schema is right. The quality bugs that show up in 10 records become expensive at 10,000 records.
 
-**Pin the output schema explicitly.** SFT data should project to OpenAI `messages` if downstream is `prep/sft_packing` or `sft/automodel`. DPO data must project to `{prompt, chosen, rejected}` for `prep/rl_prep` → `rl/nemo_rl/dpo`. Tool-use data uses `structured_messages` with `messages` + `tools`. Don't generate ambiguous schemas hoping a downstream consumer will untangle them.
+**Pin the output schema explicitly.** SFT data should project to OpenAI `messages` if downstream is `data_prep/sft_packing` or `sft/automodel`. DPO data must project to `{prompt, chosen, rejected}` for `data_prep/rl_prep` → `rl/nemo_rl/dpo`. Tool-use data uses `structured_messages` with `messages` + `tools`. Don't generate ambiguous schemas hoping a downstream consumer will untangle them.
 
 **Curate seed data deliberately.** Seeds are the single biggest lever on output diversity:
 - Representative of the target deployment audience.
diff --git a/src/nemotron/steps/patterns/sft-data-blending.md b/src/nemotron/steps/patterns/sft-data-blending.md
index 4c6d18daa..c2bc0a9f3 100644
--- a/src/nemotron/steps/patterns/sft-data-blending.md
+++ b/src/nemotron/steps/patterns/sft-data-blending.md
@@ -7,7 +7,7 @@ triggers:
   - "You are mixing translated/synthetic data with curated human-written data."
   - "Sovereign / regional SFT data is being blended with broader open-source instruction sets."
   - "After SFT the model loses one capability while gaining another."
-steps: [prep/sft_packing, sft/automodel, sft/megatron_bridge, peft/automodel, peft/megatron_bridge]
+steps: [data_prep/sft_packing, sft/automodel, sft/megatron_bridge, peft/automodel, peft/megatron_bridge]
 confidence: high
 ---
 
@@ -17,7 +17,7 @@ Apply this whenever SFT data comes from more than one source. Sovereign customiz
 
 This pattern matters most when capabilities trade against each other: a mostly-English instruction set will pull a target-language model back toward English; a tool-call-heavy blend can degrade chat fluency; a reasoning-heavy blend can hurt brevity on conversational tasks.
 
-Apply it before `prep/sft_packing`. The blend ratios decide what goes into the packed Parquet; reshuffling after packing means repacking.
+Apply it before `data_prep/sft_packing`. The blend ratios decide what goes into the packed Parquet; reshuffling after packing means repacking.
 
 ## What to do
 
@@ -31,7 +31,7 @@ Apply it before `prep/sft_packing`. The blend ratios decide what goes into the p
 
 **Keep capability slices balanced.** If reasoning, tool use, and chat each need to work, each capability should have enough rows to register against the blend. A 5K-row reasoning slice in a 500K-row corpus is a rounding error.
 
-**Translate, don't paraphrase, when localizing.** When mixing translated open-source data with target-language native data, run `translate/nemo_skills` with FAITH scoring (see the step's strategies) and keep faith ≥ 0.7. Low-faith translations dilute the language signal.
+**Translate, don't paraphrase, when localizing.** When mixing translated open-source data with target-language native data, run `translate/nemo_curator` with FAITH scoring (see the step's strategies) and keep faith ≥ 0.7. Low-faith translations dilute the language signal.
 
 **Validate the blend before packing.** Sample 100 rows proportional to the planned blend and inspect. If the sample doesn't look like what you want the model to do, the full blend won't either.
 
diff --git a/src/nemotron/steps/patterns/sft-sequence-packing.md b/src/nemotron/steps/patterns/sft-sequence-packing.md
index 00bbe3ba8..078c87b35 100644
--- a/src/nemotron/steps/patterns/sft-sequence-packing.md
+++ b/src/nemotron/steps/patterns/sft-sequence-packing.md
@@ -1,12 +1,12 @@
 ---
 id: sft-sequence-packing
 title: "Pack variable-length SFT data"
-tags: [prep, sft, efficiency]
+tags: [data_prep, sft, efficiency]
 triggers:
   - "Training examples range from very short to very long sequences."
   - "GPU utilization is poor because padding dominates batches."
   - "You are preparing data for Megatron-Bridge SFT with packed inputs available."
-steps: [prep/sft_packing]
+steps: [data_prep/sft_packing]
 confidence: high
 ---
 
diff --git a/src/nemotron/steps/patterns/translate-training-corpus.md b/src/nemotron/steps/patterns/translate-training-corpus.md
index 6c419a3c0..d9b0eba99 100644
--- a/src/nemotron/steps/patterns/translate-training-corpus.md
+++ b/src/nemotron/steps/patterns/translate-training-corpus.md
@@ -6,7 +6,7 @@ triggers:
   - "The user wants to translate a corpus, dataset, or chat records before CPT or SFT."
   - "Training data must be produced in a target language from source-language examples."
   - "A multilingual fine-tuning pipeline needs translated JSONL or Parquet artifacts."
-steps: [translate/translation]
+steps: [translate/nemo_curator]
 confidence: high
 ---
 
@@ -18,7 +18,7 @@ Do not apply it to benchmark-only translation. Benchmark translation has differe
 
 ## What to do
 
-Insert `translate/translation` before packing or training. Ask for explicit source and target language codes, the input format, and the field path to translate.
+Insert `translate/nemo_curator` before packing or training. Ask for explicit source and target language codes, the input format, and the field path to translate.
 
 Prefer Curator-native reader -> `TranslationStage` -> writer flow. Do not generate custom pandas chunking unless the user has one huge single file and Curator file partitioning is not enough.
 
diff --git a/src/nemotron/steps/peft/SKILL.md b/src/nemotron/steps/peft/SKILL.md
index 761b11663..18e762ed6 100644
--- a/src/nemotron/steps/peft/SKILL.md
+++ b/src/nemotron/steps/peft/SKILL.md
@@ -30,25 +30,26 @@ instruction-format adherence).
 ## Pipeline impact
 
 **If AutoModel:**
-- No prep step. Reads `training_jsonl` directly.
+- No data_prep step. Reads `training_jsonl` directly.
 - LoRA defaults: `peft.dim=8` or `16`, `peft.alpha ≈ 2 * peft.dim`.
 - Output is an HF-format adapter merged via `convert/merge_lora`.
 
 **If Megatron-Bridge:**
-- Add [`prep/sft_packing`](../prep/sft_packing/SKILL.md) upstream.
+- Add [`data_prep/sft_packing`](../data_prep/sft_packing/SKILL.md) upstream.
 - Requires a base `checkpoint_megatron` at `checkpoint.pretrained_checkpoint`.
 - Output is a Megatron-format adapter.
 
 ## Workflow
 
-1. **Env profile first** — verify the env profile for Lepton/Slurm/Ray runs
-   (`env.toml` by default, or `NEMOTRON_ENV_FILE` for backend-specific files).
-2. Pick backend per the decision tree above.
-3. Read the chosen step's `step.toml` for parameters/strategies/errors.
-4. Smoke-test with `config/tiny.yaml` before scaling.
-5. Keep base model + tokenizer + chat template identical to any later
+1. Pick backend per the decision tree above.
+2. Read the chosen step's `step.toml` for parameters/strategies/errors.
+3. Smoke-test with `config/tiny.yaml` before scaling.
+4. Keep base model + tokenizer + chat template identical to any later
    `sft/*` or `eval/*` consumer — see
    [../patterns/prep-data-is-tokenizer-locked.md](../patterns/prep-data-is-tokenizer-locked.md).
+5. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 6. Treat the adapter as a **separate artifact** until merge — see
    [../patterns/peft-adapter-merge-discipline.md](../patterns/peft-adapter-merge-discipline.md).
 7. Decide whether LoRA is even the right tool — see
@@ -61,10 +62,28 @@ instruction-format adherence).
 ## Smoke commands
 
 ```bash
-nemotron step run peft/automodel -c tiny
-nemotron step run peft/megatron_bridge -c tiny   # requires compatible packed_parquet + base checkpoint
+uv run nemotron steps run peft/automodel -c tiny --dry-run
+uv run nemotron steps run peft/megatron_bridge -c tiny --dry-run   # requires compatible packed_parquet + base checkpoint
 ```
 
+## Project layout for generated configs
+
+Keep every generated overlay config and any supporting code under a single
+self-contained project root that also holds the local input data, so the
+whole directory is rsync/scp-portable to the remote machine that will run
+the PEFT step.
+
+- `<project>/config/` for generated YAML — never write into
+  `src/nemotron/steps/peft/<backend>/config/`; the shipped `default.yaml`
+  and `tiny.yaml` stay as catalog references.
+- `<project>/data/` for local datasets, chat-format JSONL, and packed
+  Parquet splits referenced by the overlay.
+- Adapter output paths (`checkpoint_lora`) should resolve under the same
+  project root so the trained adapter ships with its provenance.
+- Add project-root scripts only when catalog code cannot serve the request.
+- Do not split generated files into home dirs, scratch dirs, or paths
+  outside the project root that will not ship with the bundle.
+
 ## Guardrails
 
 - Keep LoRA rank low for tight memory; raise it only for harder tasks.
diff --git a/src/nemotron/steps/peft/automodel/SKILL.md b/src/nemotron/steps/peft/automodel/SKILL.md
index 9452b0a5e..2a8276ca8 100644
--- a/src/nemotron/steps/peft/automodel/SKILL.md
+++ b/src/nemotron/steps/peft/automodel/SKILL.md
@@ -17,6 +17,9 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Configure
 
+- Set `model.pretrained_model_name_or_path` and keep that exact base recorded
+  with the adapter for later merge.
+- Set `dataset.path_or_dataset_id` to chat-format JSONL.
 - Start with `peft.dim=8` or `16` on tight memory, then increase for harder tasks.
 - Keep `peft.alpha` near `2 * peft.dim` unless there is a reason to tune it.
 - Use smaller base models for single-GPU experiments.
@@ -38,6 +41,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Guardrails
 
-- Do not run `prep/sft_packing`; this step consumes JSONL directly.
+- Do not run `data_prep/sft_packing`; this step consumes JSONL directly.
 - Reduce rank and sequence length before changing the training wrapper for OOMs.
-- Treat the adapter as a separate artifact until merge and eval have passed.
+- Treat the adapter as a separate artifact until merge and eval have passed,
+  and preserve base/tokenizer/rank/alpha provenance with it.
diff --git a/src/nemotron/steps/peft/automodel/step.toml b/src/nemotron/steps/peft/automodel/step.toml
index b9fc29f32..a27cd1125 100644
--- a/src/nemotron/steps/peft/automodel/step.toml
+++ b/src/nemotron/steps/peft/automodel/step.toml
@@ -47,6 +47,14 @@ name = "meta-llama/Llama-3.1-8B-Instruct"
 description = "Common HF baseline for single-node LoRA."
 min_gpus = 2
 
+[[parameters]]
+name = "model.pretrained_model_name_or_path"
+description = "HF base model used for adapter training. Record it with the adapter because merge requires the same base."
+
+[[parameters]]
+name = "dataset.path_or_dataset_id"
+description = "Chat-format JSONL path or dataset id. This step consumes JSONL directly, not packed Parquet."
+
 [[parameters]]
 name = "peft.dim"
 description = "LoRA rank. 8-32 is the typical range; raise for harder tasks."
@@ -69,7 +77,16 @@ then = "After training, run convert/merge_lora to fold the adapter into the base
 name = "oom"
 recovery = "Reduce dim, lower seq_length, or move to a smaller base model."
 
+[[errors]]
+name = "base_not_recorded_for_merge"
+recovery = "Record model.pretrained_model_name_or_path, tokenizer, rank, alpha, and target modules with the adapter so convert/merge_lora can use the exact base."
+
+[[errors]]
+name = "packed_parquet_used_with_automodel"
+recovery = "Use the source training_jsonl or switch to peft/megatron_bridge if packed Parquet and Megatron checkpoints are required."
+
 [reference]
+skill = "src/nemotron/steps/peft/automodel/SKILL.md"
 repo = "https://github.com/NVIDIA-NeMo/Automodel"
 readme = "https://github.com/NVIDIA-NeMo/Automodel/blob/main/README.md"
 docs = "https://docs.nvidia.com/nemo/automodel/latest/index.html"
diff --git a/src/nemotron/steps/peft/megatron_bridge/SKILL.md b/src/nemotron/steps/peft/megatron_bridge/SKILL.md
index 03564cf9c..9af5deb75 100644
--- a/src/nemotron/steps/peft/megatron_bridge/SKILL.md
+++ b/src/nemotron/steps/peft/megatron_bridge/SKILL.md
@@ -11,7 +11,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Inputs And Outputs
 
-- Consume `packed_parquet` from `prep/sft_packing`.
+- Consume `packed_parquet` from `data_prep/sft_packing`.
 - Consume a base `checkpoint_megatron`.
 - Produce `checkpoint_lora`.
 - Validate with a short adapter run before scaling data, rank, or sequence length.
@@ -20,6 +20,9 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Keep `peft.type=lora`.
 - Start with the default `peft.dim`, then reduce it if memory is tight.
+- Set `checkpoint.pretrained_checkpoint` to a real Megatron checkpoint
+  directory and keep adapter outputs separate.
+- Set `load_hf_weights=false` when PEFT starts from a Megatron checkpoint (the normal case).
 - Keep packed-data tokenizer and sequence length aligned with the base model.
 - Merge or convert adapters when downstream consumers need HF model layout.
 - Check `src/nemotron/steps/patterns/prep-data-is-tokenizer-locked.md` before reusing packed data.
@@ -31,7 +34,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Use `load_hf_weights: false` when PEFT starts from `checkpoint.pretrained_checkpoint`; use HF loading only when deliberately bootstrapping from HF weights.
 - Keep `model.sequence_parallel: true` when `model.tensor_model_parallel_size > 1` and MoE is enabled.
 - When checkpoint save reliability matters more than async throughput, prefer `checkpoint.async_save: false`, `checkpoint.fully_parallel_save: false`, `checkpoint.save_optim: false`, and `checkpoint.save_rng: false`.
-- `dataset.packed_sequence_specs.packed_train_data_path` should point at `splits/train/*.parquet` produced by `prep/sft_packing`.
+- `dataset.packed_sequence_specs.packed_train_data_path` should point at `splits/train/*.parquet` produced by `data_prep/sft_packing`.
 
 ## Local Files
 
@@ -41,6 +44,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Guardrails
 
-- Run `prep/sft_packing` first unless a compatible packed dataset already exists.
+- Run `data_prep/sft_packing` first unless a compatible packed dataset already exists.
 - Use `sft/megatron_bridge` instead when the user explicitly needs full fine-tuning.
 - Keep the base Megatron checkpoint path separate from adapter output paths.
diff --git a/src/nemotron/steps/peft/megatron_bridge/config/tiny.yaml b/src/nemotron/steps/peft/megatron_bridge/config/tiny.yaml
index fe7007ae3..f20b5f1f0 100644
--- a/src/nemotron/steps/peft/megatron_bridge/config/tiny.yaml
+++ b/src/nemotron/steps/peft/megatron_bridge/config/tiny.yaml
@@ -27,7 +27,7 @@ run:
 dataset:
   packed_sequence_specs:
     packed_sequence_size: 4096
-    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/functional/prep/sft_packing/tiny/splits/train/*.parquet}
+    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/functional/data_prep/sft_packing/tiny/splits/train/*.parquet}
 
 train:
   train_iters: 5
diff --git a/src/nemotron/steps/peft/megatron_bridge/step.toml b/src/nemotron/steps/peft/megatron_bridge/step.toml
index d3f916d40..2ac774dd9 100644
--- a/src/nemotron/steps/peft/megatron_bridge/step.toml
+++ b/src/nemotron/steps/peft/megatron_bridge/step.toml
@@ -19,7 +19,7 @@ category = "peft"
 description = """\
 Parameter-efficient fine-tuning (LoRA) on top of Megatron-Bridge. Useful when a
 full SFT exceeds memory but you still want TP/PP/CP scaling. Consumes packed
-Parquet from prep/sft_packing."""
+Parquet from data_prep/sft_packing."""
 tags = ["peft", "lora", "fine-tuning", "megatron", "distributed-training"]
 
 [[consumes]]
@@ -47,16 +47,46 @@ name = "peft.dim"
 description = "LoRA rank."
 default = 32
 
+[[parameters]]
+name = "checkpoint.pretrained_checkpoint"
+description = "Megatron base checkpoint directory to adapt. Required for PEFT; keep it separate from adapter output paths."
+
+[[parameters]]
+name = "load_hf_weights"
+description = "Set false when starting PEFT from checkpoint.pretrained_checkpoint. Use HF loading only when deliberately bootstrapping from HF weights."
+default = false
+
+[[parameters]]
+name = "dataset.packed_sequence_specs.packed_train_data_path"
+description = "Packed training Parquet glob, usually <output_dir>/splits/train/*.parquet as produced by data_prep/sft_packing."
+
 [[strategies]]
 when = "Full SFT does not fit in memory at the desired model size"
 then = "Switch to peft/megatron_bridge to keep TP/PP scaling but drop trainable params drastically."
 skill = "Megatron-Bridge/skills/perf-techniques/parallelism-strategies/SKILL.md"
 
+[[strategies]]
+when = "Checkpoint save reliability matters more than throughput"
+then = "Prefer checkpoint.async_save=false, checkpoint.fully_parallel_save=false, checkpoint.save_optim=false, and checkpoint.save_rng=false for adapter runs."
+
+[[strategies]]
+when = "The adapter must become an HF deployment artifact"
+then = "Plan the export/merge path up front: preserve the original base, convert layouts only when required, then run convert/merge_lora."
+
 [[errors]]
 name = "missing_packed_data"
-recovery = "Run prep/sft_packing first."
+recovery = "Run data_prep/sft_packing first."
+
+[[errors]]
+name = "missing_base_checkpoint"
+recovery = "Set checkpoint.pretrained_checkpoint to a real Megatron checkpoint directory and keep load_hf_weights=false for normal PEFT."
+
+[[errors]]
+name = "base_and_adapter_paths_mixed"
+recovery = "Keep the base Megatron checkpoint path, adapter save path, and later merged HF output path distinct."
 
 [reference]
+skill = "src/nemotron/steps/peft/megatron_bridge/SKILL.md"
 recipe = "src/nemotron/recipes/nano3/stage1_sft/"
 script = "src/nemotron/recipes/nano3/stage1_sft/train.py"
 megatron_bridge_repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
@@ -64,4 +94,4 @@ megatron_bridge_readme = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/ma
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 peft_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/peft.html"
 packed_sequences_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/packed-sequences.html"
-skills = ["skills/nemotron-customize/context/mbridge-sft.txt"]
+skills = ["skills/nemotron-customize/references/context/mbridge-sft.txt"]
diff --git a/src/nemotron/steps/prep/__init__.py b/src/nemotron/steps/prep/__init__.py
deleted file mode 100644
index 040c857ca..000000000
--- a/src/nemotron/steps/prep/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Preparation step category."""
diff --git a/src/nemotron/steps/prep/rl_prep/step.toml b/src/nemotron/steps/prep/rl_prep/step.toml
deleted file mode 100644
index cf6f1c91b..000000000
--- a/src/nemotron/steps/prep/rl_prep/step.toml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[step]
-id = "prep/rl_prep"
-name = "RL Data Prep (resolve + shard)"
-category = "prep"
-description = """\
-Resolve HuggingFace dataset references in an RL data blend and shard the
-output JSONL into the prompt / preference layout expected by rl/nemo_rl/*."""
-tags = ["prep", "rl", "dpo", "grpo", "jsonl"]
-
-[[consumes]]
-type = "training_jsonl"
-description = "RL data blend referencing HF or local prompt / preference datasets."
-
-[[produces]]
-type = "training_jsonl"
-description = "Sharded JSONL splits ready for rl/nemo_rl/{dpo,rlvr,rlhf}."
-
-[[parameters]]
-name = "num_shards_per_split"
-description = "Output shards per split."
-default = 1
-
-[[parameters]]
-name = "resolve_hf_placeholders"
-description = "If true, materialise HF placeholders into local JSONL (recommended for closed-network clusters)."
-default = true
-
-[[strategies]]
-when = "Cluster has no HF Hub access"
-then = "Keep resolve_hf_placeholders=true so all data is materialised locally before training."
-
-[[strategies]]
-when = "Producing data for RLVR / GRPO"
-then = "Ensure each prompt record has the verifiable answer field (e.g. 'answer' for math)."
-
-[reference]
-script = "src/nemotron/data_prep/recipes/rl.py"
-skills = ["skills/nemotron-customize/context/nemotron-data-prep.txt"]
diff --git a/src/nemotron/steps/pretrain/SKILL.md b/src/nemotron/steps/pretrain/SKILL.md
index 1fbcc4787..19a7caec4 100644
--- a/src/nemotron/steps/pretrain/SKILL.md
+++ b/src/nemotron/steps/pretrain/SKILL.md
@@ -20,7 +20,7 @@ The "default model" column shows what the shipped `config/default.yaml`
 selects. Override at CLI:
 
 ```bash
-nemotron step run pretrain/automodel -c default \
+uv run nemotron steps run pretrain/automodel -c default --dry-run \
   model.pretrained_model_name_or_path=<your-hf-id>
 ```
 
@@ -50,7 +50,7 @@ mandatory blend-with-general-data discipline.
 
 ## Pre-conditions
 
-1. **Compatible bin/idx data** from [`prep/pretrain_prep`](../prep/pretrain_prep/SKILL.md).
+1. **Compatible bin/idx data** from [`data_prep/pretrain_prep`](../data_prep/pretrain_prep/SKILL.md).
    `blend.json` is the trainer's entry — its tokenizer must match the model's.
 2. **A documented token budget** (target_tokens, seq_length, gbs, train_iters,
    lr schedule, ckpt cadence). See [../patterns/pretrain-token-budget-before-scale.md](../patterns/pretrain-token-budget-before-scale.md).
@@ -61,22 +61,23 @@ mandatory blend-with-general-data discipline.
 ## Pipeline placement
 
 ```
-curate/nemo_curator → prep/pretrain_prep → pretrain/automodel        → checkpoint_hf
-                                          → pretrain/megatron_bridge → checkpoint_megatron
+curate/nemo_curator → data_prep/pretrain_prep → pretrain/automodel        → checkpoint_hf
+                                              → pretrain/megatron_bridge → checkpoint_megatron
                                                                        (then convert/megatron_to_hf if HF needed downstream)
 ```
 
 ## Workflow
 
-1. **Env profile first** — verify the env profile for Lepton/Slurm/Ray runs
-   (`env.toml` by default, or `NEMOTRON_ENV_FILE` for backend-specific files).
-2. Run [`prep/pretrain_prep`](../prep/pretrain_prep/SKILL.md) on a tokenizer
+1. Run [`data_prep/pretrain_prep`](../data_prep/pretrain_prep/SKILL.md) on a tokenizer
    that matches the trainer.
-3. Write the budget down (target_tokens / seq_length / gbs / train_iters /
+2. Write the budget down (target_tokens / seq_length / gbs / train_iters /
    lr schedule / ckpt cadence) **before code changes**.
-4. Pick backend per the decision tree.
-5. Smoke with `config/tiny.yaml` to verify launch + data access + checkpoint
+3. Pick backend per the decision tree.
+4. Smoke with `config/tiny.yaml` to verify launch + data access + checkpoint
    write/restore.
+5. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 6. Run a short *representative* job at production sequence length and
    parallelism to validate throughput and val-loss movement.
 7. For CPT, evaluate at every checkpoint to catch forgetting early.
@@ -88,10 +89,28 @@ curate/nemo_curator → prep/pretrain_prep → pretrain/automodel        → che
 ## Smoke commands
 
 ```bash
-nemotron step run pretrain/automodel       -c tiny
-nemotron step run pretrain/megatron_bridge -c tiny
+uv run nemotron steps run pretrain/automodel       -c tiny --dry-run
+uv run nemotron steps run pretrain/megatron_bridge -c tiny --dry-run
 ```
 
+## Project layout for generated configs
+
+Keep every generated overlay config and any supporting code under a single
+self-contained project root that also holds the local input data, so the
+whole directory is rsync/scp-portable to the remote machine that will run
+the pretrain step.
+
+- `<project>/config/` for generated YAML — never write into
+  `src/nemotron/steps/pretrain/<backend>/config/`; the shipped
+  `default.yaml` and `tiny.yaml` stay as catalog references.
+- `<project>/data/` for the bin/idx shards and the `blend.json` emitted by
+  `data_prep/pretrain_prep`, plus any held-out validation slices.
+- Keep checkpoint save dirs and budget/lr-schedule notes under the same
+  project root so the run is reproducible after a remote transfer.
+- Add project-root scripts only when catalog code cannot serve the request.
+- Do not split generated files into home dirs, scratch dirs, or paths
+  outside the project root that will not ship with the bundle.
+
 ## Patterns to cite
 
 - [../patterns/pretrain-token-budget-before-scale.md](../patterns/pretrain-token-budget-before-scale.md) — budget contract before scaling.
diff --git a/src/nemotron/steps/pretrain/automodel/SKILL.md b/src/nemotron/steps/pretrain/automodel/SKILL.md
index 3f9098a7e..95909bd04 100644
--- a/src/nemotron/steps/pretrain/automodel/SKILL.md
+++ b/src/nemotron/steps/pretrain/automodel/SKILL.md
@@ -11,7 +11,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Inputs And Outputs
 
-- Consume `binidx` data from `prep/pretrain_prep`.
+- Consume `binidx` data from `data_prep/pretrain_prep`.
 - Produce `checkpoint_hf`.
 - Validate data loading and checkpoint output with a short run before scaling token budget.
 
@@ -19,6 +19,10 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Set `model.pretrained_model_name_or_path` for continued pretraining from an HF base.
 - Set `load_weights=false` only when intentionally training from scratch.
+- Set `dataset.paths` and `validation_dataset.paths` to the
+  data_prep-emitted `blend.json`.
+- Keep `dataset.seq_length`, `validation_dataset.seq_length`, and model
+  context aligned.
 - Keep tokenizer and vocab settings aligned with the bin/idx artifact.
 - Use launcher and executor settings from the AutoModel runner for cluster moves.
 - Check `src/nemotron/steps/patterns/pretrain-token-budget-before-scale.md` before changing pretraining strategy.
diff --git a/src/nemotron/steps/pretrain/automodel/config/tiny.yaml b/src/nemotron/steps/pretrain/automodel/config/tiny.yaml
index c6734f2fd..975b07660 100644
--- a/src/nemotron/steps/pretrain/automodel/config/tiny.yaml
+++ b/src/nemotron/steps/pretrain/automodel/config/tiny.yaml
@@ -31,14 +31,14 @@ checkpoint:
   checkpoint_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/pretrain-automodel-tiny}/checkpoints
 
 dataset:
-  paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/prep/pretrain_idxbin/tiny/blend.json}
+  paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/data_prep/pretrain_prep/tiny/blend.json}
   index_mapping_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/pretrain-automodel-tiny}/index_mapping/train
   seq_length: 1024
   num_train_samples: 128
   trainer_max_steps: 5
 
 validation_dataset:
-  paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/prep/pretrain_idxbin/tiny/blend.json}
+  paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/data_prep/pretrain_prep/tiny/blend.json}
   index_mapping_dir: ${oc.env:PRETRAIN_OUTPUT_DIR,./output/pretrain-automodel-tiny}/index_mapping/validation
   seq_length: 1024
   num_val_samples: 16
diff --git a/src/nemotron/steps/pretrain/automodel/step.toml b/src/nemotron/steps/pretrain/automodel/step.toml
index 350aa25be..a04526074 100644
--- a/src/nemotron/steps/pretrain/automodel/step.toml
+++ b/src/nemotron/steps/pretrain/automodel/step.toml
@@ -23,7 +23,7 @@ tags = ["pretrain", "cpt", "automodel", "huggingface"]
 
 [[consumes]]
 type = "binidx"
-description = "Megatron bin/idx tokenized text shards (from prep/pretrain_prep)."
+description = "Megatron bin/idx tokenized text shards (from data_prep/pretrain_prep)."
 required = true
 
 [[produces]]
@@ -41,6 +41,23 @@ name = "model.pretrained_model_name_or_path"
 description = "HF base for CPT, or pass a config-only path to train from scratch."
 default = "Qwen/Qwen3-30B-A3B"
 
+[[parameters]]
+name = "load_weights"
+description = "True for continued pretraining from an HF base; false only when intentionally training from scratch."
+default = true
+
+[[parameters]]
+name = "dataset.paths"
+description = "Path to data_prep/pretrain_prep emitted blend.json for training bin/idx data."
+
+[[parameters]]
+name = "validation_dataset.paths"
+description = "Path to the validation blend.json or validation portion emitted by data_prep/pretrain_prep."
+
+[[parameters]]
+name = "dataset.seq_length"
+description = "Sequence length used by the MegatronPretraining data loader; keep aligned with validation_dataset.seq_length and model context."
+
 [[strategies]]
 when = "Continued pretraining on a domain corpus"
 then = "Set load_weights=true and lower the learning rate (1e-5 to 5e-5)."
@@ -49,7 +66,20 @@ then = "Set load_weights=true and lower the learning rate (1e-5 to 5e-5)."
 when = "Pretraining from scratch"
 then = "Set load_weights=false and use a warmup + cosine schedule sized to the token budget."
 
+[[errors]]
+name = "missing_blend_json"
+recovery = "Run data_prep/pretrain_prep first and set dataset.paths / validation_dataset.paths to the emitted blend.json."
+
+[[errors]]
+name = "tokenizer_or_sequence_mismatch"
+recovery = "Rebuild bin/idx if tokenizer assumptions change, and keep dataset.seq_length, validation_dataset.seq_length, and model context aligned."
+
+[[errors]]
+name = "unsupported_scheduler_kwargs"
+recovery = "Use lr_scheduler.lr_decay_style and min_lr with containers whose OptimizerParamScheduler does not accept warmup_steps."
+
 [reference]
+skill = "src/nemotron/steps/pretrain/automodel/SKILL.md"
 repo = "https://github.com/NVIDIA-NeMo/Automodel"
 readme = "https://github.com/NVIDIA-NeMo/Automodel/blob/main/README.md"
 docs = "https://docs.nvidia.com/nemo/automodel/latest/index.html"
@@ -57,4 +87,4 @@ pretrain_guide = "https://docs.nvidia.com/nemo/automodel/latest/guides/llm/nanog
 recipes_overview = "https://docs.nvidia.com/nemo/automodel/latest/guides/overview.html"
 examples = "https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_pretrain"
 recipe_script = "https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/train_ft.py"
-skills = ["skills/nemotron-customize/context/automodel-pretrain.txt"]
+skills = ["skills/nemotron-customize/references/context/automodel-pretrain.txt"]
diff --git a/src/nemotron/steps/pretrain/megatron_bridge/SKILL.md b/src/nemotron/steps/pretrain/megatron_bridge/SKILL.md
index 1974a14b6..5cf238b49 100644
--- a/src/nemotron/steps/pretrain/megatron_bridge/SKILL.md
+++ b/src/nemotron/steps/pretrain/megatron_bridge/SKILL.md
@@ -11,7 +11,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Inputs And Outputs
 
-- Consume `binidx` data and `blend.json` from `prep/pretrain_prep`.
+- Consume `binidx` data and `blend.json` from `data_prep/pretrain_prep`.
 - Optionally initialize from a base checkpoint or HF weights for continued pretraining.
 - Produce `checkpoint_megatron`.
 - Validate data loading, parallelism, and checkpoint output with a short run before scaling token budget.
@@ -19,6 +19,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 ## Configure
 
 - Keep `seq_length` aligned with the data and token budget.
+- Set `dataset.data_paths` to the data_prep/pretrain_prep emitted `blend.json`.
 - Set `load_hf_weights` or checkpoint paths explicitly for continued pretraining.
 - Start from the closest Megatron-Bridge recipe and override only required knobs.
 - Tune tensor, pipeline, context, and expert parallelism before scaling global batch.
@@ -28,7 +29,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 ## Config Nuances
 
 - Keep `recipe.seq_length`, `model.seq_length`, and `dataset.seq_length` identical; Bridge validates the model and dataset values before setup.
-- Set `dataset.data_paths` to the bin/idx `blend.json` from `prep/pretrain_prep`, not SFT packed Parquet.
+- Set `dataset.data_paths` to the bin/idx `blend.json` from `data_prep/pretrain_prep`, not SFT packed Parquet.
 - For Qwen/Nemotron MoE runs, keep `model.sequence_parallel: true` with tensor parallelism.
 - If Transformer Engine userbuffers are enabled on a system without CUDA multicast support, set `run.env.env_vars.UB_SKIPMC: "1"` or default it in `step.py` before Bridge initialization.
 - Use `train.global_batch_size` as a multiple of data-parallel size; start with `train.micro_batch_size: 1` when validating a new parallelism shape.
@@ -42,6 +43,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Guardrails
 
-- Run `prep/pretrain_prep` first unless compatible bin/idx data already exists.
+- Run `data_prep/pretrain_prep` first unless compatible bin/idx data already exists.
 - Verify data paths and checkpoint writes on the target executor before long jobs.
 - Convert Megatron checkpoints only when the downstream consumer requires HF layout.
diff --git a/src/nemotron/steps/pretrain/megatron_bridge/config/tiny.yaml b/src/nemotron/steps/pretrain/megatron_bridge/config/tiny.yaml
index e68ac1b57..57c6e8317 100644
--- a/src/nemotron/steps/pretrain/megatron_bridge/config/tiny.yaml
+++ b/src/nemotron/steps/pretrain/megatron_bridge/config/tiny.yaml
@@ -38,7 +38,7 @@ model:
   recompute_num_layers: 1
 
 dataset:
-  data_paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/prep/pretrain_idxbin/tiny/blend.json}
+  data_paths: ${oc.env:PRETRAIN_BLEND_PATH,/mnt/lustre-shared/output/functional/data_prep/pretrain_prep/tiny/blend.json}
   seq_length: 4096
 
 train:
diff --git a/src/nemotron/steps/pretrain/megatron_bridge/step.toml b/src/nemotron/steps/pretrain/megatron_bridge/step.toml
index 6d2c05d6b..4aad9438a 100644
--- a/src/nemotron/steps/pretrain/megatron_bridge/step.toml
+++ b/src/nemotron/steps/pretrain/megatron_bridge/step.toml
@@ -23,7 +23,7 @@ tags = ["pretrain", "cpt", "megatron", "distributed-training"]
 
 [[consumes]]
 type = "binidx"
-description = "Megatron bin/idx tokenized text shards with blend.json (from prep/pretrain_prep)."
+description = "Megatron bin/idx tokenized text shards with blend.json (from data_prep/pretrain_prep)."
 required = true
 
 [[consumes]]
@@ -41,11 +41,24 @@ description = "Training sequence length."
 default = 8192
 choices = [2048, 4096, 8192, 16384]
 
+[[parameters]]
+name = "dataset.data_paths"
+description = "Path to data_prep/pretrain_prep emitted blend.json. Do not use SFT packed Parquet here."
+
 [[parameters]]
 name = "load_hf_weights"
 description = "If true, initialise from HF base weights (CPT). If false, train from scratch."
 default = true
 
+[[parameters]]
+name = "train.micro_batch_size"
+description = "Per-rank micro batch size. Start at 1 while validating a new parallelism shape."
+default = 1
+
+[[parameters]]
+name = "train.global_batch_size"
+description = "Global batch size. Keep it divisible by data-parallel size."
+
 [[strategies]]
 when = "Large token budget (> 100B)"
 then = "Increase pipeline parallelism and enable activation checkpointing to preserve memory."
@@ -57,11 +70,21 @@ then = "Lower LR (1e-5 - 5e-5) and shorter warmup; keep load_hf_weights=true."
 
 [[errors]]
 name = "missing_blend_json"
-recovery = "Run prep/pretrain_prep and point dataset.data_paths at the produced blend.json."
+recovery = "Run data_prep/pretrain_prep and point dataset.data_paths at the produced blend.json."
+
+[[errors]]
+name = "sequence_length_mismatch"
+recovery = "Keep recipe.seq_length, model.seq_length, and dataset.seq_length identical, and consistent with the pretrain token-budget assumptions."
+
+[[errors]]
+name = "transformer_engine_userbuffer_failure"
+recovery = "If CUDA multicast is unavailable, set UB_SKIPMC=1 in run.env.env_vars before Bridge initialization."
 
 [reference]
+skill = "src/nemotron/steps/pretrain/megatron_bridge/SKILL.md"
 repo = "https://github.com/NVIDIA-NeMo/Megatron-Bridge"
 readme = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/main/README.md"
 docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 training_entry_points = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/entry-points.html"
 recipe = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/tree/main/megatron/bridge/recipes/nemotronh"
+skills = ["skills/nemotron-customize/references/context/mbridge-pretrain.txt"]
diff --git a/src/nemotron/steps/rl/SKILL.md b/src/nemotron/steps/rl/SKILL.md
index fe5946bed..9f92802bf 100644
--- a/src/nemotron/steps/rl/SKILL.md
+++ b/src/nemotron/steps/rl/SKILL.md
@@ -37,15 +37,15 @@ resource-server / GenRM rewards (`env.should_use_nemo_gym=true`).
    not launch fails. See
    [../patterns/rl-validate-rewards-before-scale.md](../patterns/rl-validate-rewards-before-scale.md).
 3. **Materialized data**. If data starts as HF references, run
-   [`prep/rl_prep`](../prep/rl_prep/SKILL.md) first to resolve placeholders
+   [`data_prep/rl_prep`](../data_prep/rl_prep/SKILL.md) first to resolve placeholders
    into local JSONL.
 
 ## Pipeline placement
 
 ```
-... → sft/megatron_bridge → prep/rl_prep → rl/nemo_rl/dpo   → checkpoint_megatron
-                                          → rl/nemo_rl/rlvr  → checkpoint_megatron
-                                          → rl/nemo_rl/rlhf  → checkpoint_megatron
+... → sft/megatron_bridge → data_prep/rl_prep → rl/nemo_rl/dpo   → checkpoint_megatron
+                                              → rl/nemo_rl/rlvr  → checkpoint_megatron
+                                              → rl/nemo_rl/rlhf  → checkpoint_megatron
 ```
 
 Output is Megatron-format. Add [`convert/megatron_to_hf`](../convert/megatron_to_hf/step.toml)
@@ -53,17 +53,18 @@ when the next consumer (eval, deployment) expects HF.
 
 ## Workflow
 
-1. **Env profile first** — verify the env profile for Lepton/Slurm/Ray runs
-   (`env.toml` by default, or `NEMOTRON_ENV_FILE` for backend-specific files).
-2. Confirm the SFT warm-start checkpoint exists and was trained on a
+1. Confirm the SFT warm-start checkpoint exists and was trained on a
    compatible tokenizer and chat template.
-3. Run [`prep/rl_prep`](../prep/rl_prep/SKILL.md) when data needs HF
+2. Run [`data_prep/rl_prep`](../data_prep/rl_prep/SKILL.md) when data needs HF
    resolution or sharding.
-4. Pick the step per the decision tree.
-5. Validate the reward path on a tiny set **before scaling rollout count** —
+3. Pick the step per the decision tree.
+4. Validate the reward path on a tiny set **before scaling rollout count** —
    see the rewards pattern above.
-6. Use `config/tiny.yaml` for runner validation; method-specific configs
+5. Use `config/tiny.yaml` for runner validation; method-specific configs
    (`config/nemo_gym.yaml` for resource-server rewards) for production.
+6. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 7. Track KL, reward variance, reward saturation, response length, and
    held-out task evals — not just reward.
 8. Bookend with eval — see
@@ -74,11 +75,32 @@ when the next consumer (eval, deployment) expects HF.
 ## Smoke commands
 
 ```bash
-nemotron step run rl/nemo_rl/dpo  -c tiny
-nemotron step run rl/nemo_rl/rlvr -c tiny
-nemotron step run rl/nemo_rl/rlhf -c tiny
+uv run nemotron steps run rl/nemo_rl/dpo  -c tiny --dry-run
+uv run nemotron steps run rl/nemo_rl/rlvr -c tiny --dry-run
+uv run nemotron steps run rl/nemo_rl/rlhf -c tiny --dry-run
 ```
 
+## Project layout for generated configs
+
+Keep every generated overlay config and any supporting code under a single
+self-contained project root that also holds the local input data, so the
+whole directory is rsync/scp-portable to the remote machine that will run
+the alignment step.
+
+- `<project>/config/` for generated YAML — never write into
+  `src/nemotron/steps/rl/nemo_rl/<algo>/config/`; the shipped
+  `default.yaml`, `tiny.yaml`, and `nemo_gym.yaml` stay as catalog
+  references.
+- `<project>/data/` for sharded preference / prompt / verifier JSONL
+  produced by `data_prep/rl_prep`.
+- Keep the SFT warm-start `checkpoint_megatron` path, RLHF reward-model
+  `checkpoint_hf` path, and any NeMo-Gym resource-server configs resolvable
+  under the same project root so the whole alignment job ships in one
+  bundle.
+- Add project-root scripts only when catalog code cannot serve the request.
+- Do not split generated files into home dirs, scratch dirs, or paths
+  outside the project root that will not ship with the bundle.
+
 ## Guardrails
 
 - Never trust reward gain alone. Score on a held-out task eval before
diff --git a/src/nemotron/steps/rl/nemo_rl/SKILL.md b/src/nemotron/steps/rl/nemo_rl/SKILL.md
index c3f0043f7..50248016f 100644
--- a/src/nemotron/steps/rl/nemo_rl/SKILL.md
+++ b/src/nemotron/steps/rl/nemo_rl/SKILL.md
@@ -37,7 +37,7 @@ The local `defaults: <yaml>` form in YAML is a small layering convenience
 ## Workflow
 
 1. Read the algorithm's `step.toml` for consumed artifacts and main knobs.
-2. Run [`../prep/rl_prep`](../../prep/rl_prep/SKILL.md) when data starts as
+2. Run [`data_prep/rl_prep`](../../data_prep/rl_prep/SKILL.md) when data starts as
    HF references or unsharded blends.
 3. Use `config/tiny.yaml` for runner validation. Use `config/nemo_gym.yaml`
    (RLVR/RLHF) when resource-server or GenRM rewards are required.
@@ -72,7 +72,7 @@ The local `defaults: <yaml>` form in YAML is a small layering convenience
 - [../../patterns/rl-validate-rewards-before-scale.md](../../patterns/rl-validate-rewards-before-scale.md) — validate every reward path before scaling.
 - [../../patterns/eval-before-and-after-training.md](../../patterns/eval-before-and-after-training.md) — RL must be scored on task evals, not just reward.
 - [../../patterns/byob-benchmark-design.md](../../patterns/byob-benchmark-design.md) — for sovereign deployments, the eval is the BYOB.
-- [../../patterns/prep-data-is-tokenizer-locked.md](../../patterns/prep-data-is-tokenizer-locked.md) — RL data sharded through `prep/rl_prep` inherits the tokenizer-lock invariant.
+- [../../patterns/prep-data-is-tokenizer-locked.md](../../patterns/prep-data-is-tokenizer-locked.md) — RL data sharded through `data_prep/rl_prep` inherits the tokenizer-lock invariant.
 
 ## Guardrails
 
diff --git a/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md b/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md
index 58ae297eb..f23235bb9 100644
--- a/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md
+++ b/src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md
@@ -14,13 +14,15 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Consume `training_jsonl` with prompt, chosen, and rejected fields.
 - Consume an SFT `checkpoint_megatron` policy.
 - Produce a DPO-aligned `checkpoint_megatron`.
-- Smoke with `nemotron step run rl/nemo_rl/dpo -c tiny`.
+- Smoke with `nemotron steps run rl/nemo_rl/dpo -c tiny`.
 
 ## Configure
 
 - Tune `dpo.reference_policy_kl_penalty` when KL collapses or loss diverges.
+- Set train and validation preference paths explicitly in the active NeMo-RL
+  data schema.
 - Lower learning rate before making structural changes to the runner.
-- Use `prep/rl_prep` when preference data starts as HF references or blended local files.
+- Use `data_prep/rl_prep` when preference data starts as HF references or blended local files.
 - Keep the reference policy aligned with the SFT policy.
 - Check `src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md` before trusting preference-pair training results.
 
@@ -40,5 +42,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 ## Guardrails
 
 - Validate chosen and rejected ordering; inverted pairs silently teach the wrong behavior.
+- Keep `policy.train_global_batch_size` divisible by the active policy worker
+  shape and micro batch size.
 - Keep train and validation preference distributions comparable.
 - Inspect examples where the model regresses after DPO; preference data can encode style bias.
diff --git a/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml b/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml
index 9e2224d94..12c556424 100644
--- a/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml
+++ b/src/nemotron/steps/rl/nemo_rl/dpo/config/tiny.yaml
@@ -23,7 +23,7 @@
 #   - dataset = a HF preference set via BinaryPreferenceDataset.
 #
 # Usage:
-#   nemotron step run rl/nemo_rl/dpo -c tiny -r lepton_rl
+#   nemotron steps run rl/nemo_rl/dpo -c tiny -r lepton_rl
 
 dpo:
   max_num_epochs: 1
diff --git a/src/nemotron/steps/rl/nemo_rl/dpo/step.toml b/src/nemotron/steps/rl/nemo_rl/dpo/step.toml
index b7242cb2e..2b8131399 100644
--- a/src/nemotron/steps/rl/nemo_rl/dpo/step.toml
+++ b/src/nemotron/steps/rl/nemo_rl/dpo/step.toml
@@ -40,10 +40,43 @@ name = "dpo.reference_policy_kl_penalty"
 description = "KL penalty against the reference policy (β in DPO)."
 default = 0.05
 
+[[parameters]]
+name = "data.train.prompt_file"
+description = "Training preference JSONL or prompt file path, depending on the active NeMo-RL dataset schema."
+
+[[parameters]]
+name = "data.validation.prompt_file"
+description = "Validation preference JSONL or prompt file path. Keep validation cadence explicit when changing run length."
+
+[[parameters]]
+name = "policy.train_global_batch_size"
+description = "Global policy batch size. Must divide cleanly across active policy workers and micro batch size."
+
 [[strategies]]
 when = "Loss diverges or KL collapses"
 then = "Increase reference_policy_kl_penalty (0.1-0.3) or lower the learning rate."
 
+[[strategies]]
+when = "Preference data starts as HF references or blended local files"
+then = "Run data_prep/rl_prep first so DPO consumes local, sharded prompt/chosen/rejected JSONL."
+
+[[strategies]]
+when = "Changing DPO run length"
+then = "Set dpo.val_at_start, dpo.val_period, and dpo.val_at_end explicitly; do not rely on upstream defaults."
+
+[[errors]]
+name = "inverted_preference_pairs"
+recovery = "Spot-check prompt/chosen/rejected ordering before training. Inverted pairs silently teach the wrong behavior."
+
+[[errors]]
+name = "batch_shape_not_divisible"
+recovery = "Make policy.train_global_batch_size divisible by policy worker shape and train_micro_batch_size."
+
+[[errors]]
+name = "reference_policy_mismatch"
+recovery = "Keep the reference policy aligned with the SFT policy checkpoint, tokenizer, and chat template."
+
 [reference]
+skill = "src/nemotron/steps/rl/nemo_rl/dpo/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/RL/blob/main/examples/run_dpo.py"
-skills = ["skills/nemotron-customize/context/nemo-rl-alignment.txt"]
+skills = ["skills/nemotron-customize/references/context/nemo-rl-alignment.txt"]
diff --git a/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md b/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md
index 3d132a34f..a7bced0a6 100644
--- a/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md
+++ b/src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md
@@ -15,14 +15,16 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Consume an SFT `checkpoint_megatron` policy.
 - Consume a reward-model `checkpoint_hf`.
 - Produce an RLHF-aligned `checkpoint_megatron`.
-- Smoke with `nemotron step run rl/nemo_rl/rlhf -c tiny`.
+- Smoke with `nemotron steps run rl/nemo_rl/rlhf -c tiny`.
 
 ## Configure
 
 - Set `env.nemo_gym.genrm_model.responses_api_models.vllm_model.model` to the reward-model path.
+- Keep `env.should_use_nemo_gym=true` for GenRM comparison rewards.
+- Set `data.train.data_path` and `data.validation.data_path` to prompt JSONL
+  normalized for the NeMo-Gym Responses API path.
 - Tune `grpo.num_generations_per_prompt` based on reward variance and serving cost.
 - Increase KL penalty, lower learning rate, or clip rewards when reward hacking appears.
-- Keep `env.should_use_nemo_gym=true` for GenRM-style comparison rewards.
 - Check `src/nemotron/steps/patterns/rl-validate-rewards-before-scale.md` before changing RLHF reward or rollout behavior.
 
 ## Config Nuances
@@ -44,4 +46,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Validate reward-model serving separately before launching policy optimization.
 - Keep policy, reference, and reward-model checkpoints clearly separated in config.
+- Do not use train data as validation unless the run is explicitly marked
+  non-evaluative.
 - Review held-out reward examples to detect judge bias or reward saturation.
diff --git a/src/nemotron/steps/rl/nemo_rl/rlhf/config/tiny.yaml b/src/nemotron/steps/rl/nemo_rl/rlhf/config/tiny.yaml
index 486891927..aaa2c0608 100644
--- a/src/nemotron/steps/rl/nemo_rl/rlhf/config/tiny.yaml
+++ b/src/nemotron/steps/rl/nemo_rl/rlhf/config/tiny.yaml
@@ -63,7 +63,7 @@ policy:
         num_nodes: 1
 
 data:
-  manifest_path: ${oc.env:RL_PREP_MANIFEST,/mnt/lustre-shared/output/functional/prep/rl/tiny/manifest.json}
+  manifest_path: ${oc.env:RL_PREP_MANIFEST,/mnt/lustre-shared/output/functional/data_prep/rl_prep/tiny/manifest.json}
   allow_train_as_validation: true
 
 env:
diff --git a/src/nemotron/steps/rl/nemo_rl/rlhf/step.toml b/src/nemotron/steps/rl/nemo_rl/rlhf/step.toml
index 75610f4f2..2cb3c0bbb 100644
--- a/src/nemotron/steps/rl/nemo_rl/rlhf/step.toml
+++ b/src/nemotron/steps/rl/nemo_rl/rlhf/step.toml
@@ -45,11 +45,28 @@ name = "grpo.num_generations_per_prompt"
 description = "Rollouts per prompt; GRPO group size."
 default = 8
 
+[[parameters]]
+name = "env.should_use_nemo_gym"
+description = "Keep true for GenRM-style comparison rewards served through NeMo-Gym."
+default = true
+
 [[parameters]]
 name = "env.nemo_gym.genrm_model.responses_api_models.vllm_model.model"
 description = "HF path or local path for the GenRM judge model served by NeMo-Gym."
 default = "/path/to/genrm/model"
 
+[[parameters]]
+name = "data.train.data_path"
+description = "Training prompt JSONL normalized for the NeMo-Gym Responses API path."
+
+[[parameters]]
+name = "data.validation.data_path"
+description = "Validation prompt JSONL. Avoid train-as-validation unless the run is explicitly non-evaluative."
+
+[[parameters]]
+name = "env.nemo_gym.genrm_model.vllm"
+description = "GenRM serving resources and limits such as tensor_parallel_size, max_num_seqs, max_model_len, and gpu_memory_utilization."
+
 [[strategies]]
 when = "Reward model saturates / reward hacking is observed"
 then = "Increase KL penalty, lower learning rate, or add reward clipping."
@@ -58,6 +75,27 @@ then = "Increase KL penalty, lower learning rate, or add reward clipping."
 when = "You have Super3-style RLHF data"
 then = "Keep env.should_use_nemo_gym=true and point data.train.data_path / data.validation.data_path at the prepared NeMo-Gym JSONL."
 
+[[strategies]]
+when = "Sizing a GenRM run"
+then = "Budget separate placement for policy/generation, NeMo-Gym GPU servers, and Ray headroom; validate GenRM readiness before policy optimization."
+
+[[errors]]
+name = "genrm_server_not_ready"
+recovery = "Tune GenRM vLLM tensor parallelism, max_num_seqs, max_model_len, and gpu_memory_utilization, then validate serving before launching policy optimization."
+
+[[errors]]
+name = "responses_api_schema_missing"
+recovery = "Normalize data rows into the Responses API shape expected by NeMo-Gym, including responses_create_params where required."
+
+[[errors]]
+name = "reward_model_policy_confused"
+recovery = "Keep policy checkpoint, reference policy, and reward-model checkpoint paths separate in config and artifact tracking."
+
+[[errors]]
+name = "train_used_as_validation_unintentionally"
+recovery = "Provide a real validation path unless the run is explicitly non-evaluative and documented as such."
+
 [reference]
+skill = "src/nemotron/steps/rl/nemo_rl/rlhf/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/Nemotron/tree/main/src/nemotron/recipes/super3/stage2_rl/stage3_rlhf"
-skills = ["skills/nemotron-customize/context/nemo-rl-alignment.txt"]
+skills = ["skills/nemotron-customize/references/context/nemo-rl-alignment.txt"]
diff --git a/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md b/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md
index 1178e1794..eb09cc9c6 100644
--- a/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md
+++ b/src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md
@@ -14,11 +14,13 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Consume prompt `training_jsonl` with verifier fields such as answers.
 - Consume an SFT `checkpoint_megatron` policy.
 - Produce an RLVR-aligned `checkpoint_megatron`.
-- Smoke with `nemotron step run rl/nemo_rl/rlvr -c tiny`.
+- Smoke with `nemotron steps run rl/nemo_rl/rlvr -c tiny`.
 
 ## Configure
 
 - Increase `grpo.num_generations_per_prompt` when reward variance is too low.
+- Size `grpo.num_prompts_per_step`, `grpo.num_generations_per_prompt`, and
+  policy batch sizes for the active Ray worker topology.
 - Keep `grpo.normalize_rewards=true` unless debugging raw reward scale.
 - Use `config/nemo_gym.yaml` for resource-server rewards.
 - Set `data.train.data_path`, `data.validation.data_path`, and `env.nemo_gym.config_paths` explicitly for NeMo-Gym.
@@ -43,5 +45,7 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 ## Guardrails
 
 - Validate reward functions on sample rollouts before training.
+- Do not mix NeMo-Gym resource-server config with the upstream generic GRPO
+  data schema.
 - Keep reward outputs bounded and deterministic when possible.
 - Avoid ambiguous reward fields; schema drift tends to surface as poor learning rather than clear failures.
diff --git a/src/nemotron/steps/rl/nemo_rl/rlvr/step.toml b/src/nemotron/steps/rl/nemo_rl/rlvr/step.toml
index 205fc2bf3..c4a18fd85 100644
--- a/src/nemotron/steps/rl/nemo_rl/rlvr/step.toml
+++ b/src/nemotron/steps/rl/nemo_rl/rlvr/step.toml
@@ -41,6 +41,10 @@ name = "grpo.num_generations_per_prompt"
 description = "Rollouts per prompt; the GRPO group size."
 default = 8
 
+[[parameters]]
+name = "grpo.num_prompts_per_step"
+description = "Prompt batch per GRPO step. The rollout batch is num_prompts_per_step * num_generations_per_prompt."
+
 [[parameters]]
 name = "grpo.normalize_rewards"
 description = "Normalise rewards within the group before computing advantages."
@@ -51,6 +55,18 @@ name = "env.should_use_nemo_gym"
 description = "Switch from the upstream generic GRPO example to the explicit NeMo-Gym GRPO runner."
 default = false
 
+[[parameters]]
+name = "data.train.data_path"
+description = "Training prompt JSONL with verifier fields for the active reward path."
+
+[[parameters]]
+name = "data.validation.data_path"
+description = "Validation prompt JSONL with verifier fields. Required when validation is scheduled."
+
+[[parameters]]
+name = "policy.logprob_batch_size"
+description = "Per-worker logprob microbatch after rollout data is sharded across policy workers."
+
 [[strategies]]
 when = "Reward variance is low"
 then = "Raise num_generations_per_prompt and use leave-one-out baselines."
@@ -59,6 +75,27 @@ then = "Raise num_generations_per_prompt and use leave-one-out baselines."
 when = "You have Super3-style JSONL or resource-server rewards"
 then = "Start from config/nemo_gym.yaml and set data.train.data_path, data.validation.data_path, and env.nemo_gym.config_paths."
 
+[[strategies]]
+when = "Validation is enabled"
+then = "Keep grpo.max_val_samples and grpo.val_batch_size both numeric or both deliberately null so validation math is explicit."
+
+[[errors]]
+name = "missing_verifier_fields"
+recovery = "Ensure each training and validation record includes the answer, tests, env metadata, or resource-server fields expected by the reward function."
+
+[[errors]]
+name = "rollout_batch_not_divisible"
+recovery = "Make grpo.num_prompts_per_step * grpo.num_generations_per_prompt and policy batch sizes divisible by the active policy shard count."
+
+[[errors]]
+name = "mixed_nemo_gym_and_upstream_config"
+recovery = "Use env.should_use_nemo_gym=true only with NeMo-Gym/resource-server configs; otherwise keep the upstream GRPO data schema."
+
+[[errors]]
+name = "reward_function_not_validated"
+recovery = "Run the verifier or resource-server reward path on sample rollouts before increasing rollout count or GPUs."
+
 [reference]
+skill = "src/nemotron/steps/rl/nemo_rl/rlvr/SKILL.md"
 script = "https://github.com/NVIDIA-NeMo/RL/blob/main/examples/run_grpo.py"
-skills = ["skills/nemotron-customize/context/nemo-rl-alignment.txt"]
+skills = ["skills/nemotron-customize/references/context/nemo-rl-alignment.txt"]
diff --git a/src/nemotron/steps/rl/nemo_rl_grpo/step.toml b/src/nemotron/steps/rl/nemo_rl_grpo/step.toml
deleted file mode 100644
index 4f0605ac4..000000000
--- a/src/nemotron/steps/rl/nemo_rl_grpo/step.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-[step]
-id = "rl/nemo_rl_grpo"
-name = "RL Alignment (NeMo-RL / GRPO)"
-category = "rl"
-status = "planned"
-description = "Planned: align an SFT-trained Megatron checkpoint with GRPO using NeMo-RL."
-tags = ["planned", "rl", "grpo", "nemo-rl"]
-
-[[consumes]]
-type = "training_jsonl"
-description = "Prompt or rollout JSONL used for RL sampling."
-
-[[consumes]]
-type = "checkpoint_megatron"
-description = "SFT-trained Megatron checkpoint to align."
-
-[[produces]]
-type = "checkpoint_megatron"
-description = "GRPO-aligned Megatron checkpoint."
diff --git a/src/nemotron/steps/sdg/SKILL.md b/src/nemotron/steps/sdg/SKILL.md
index 7352daf32..ebf871437 100644
--- a/src/nemotron/steps/sdg/SKILL.md
+++ b/src/nemotron/steps/sdg/SKILL.md
@@ -35,10 +35,10 @@ The catalog ships one step under this category:
 ## Pipeline placement
 
 ```
-sdg/data_designer (default.yaml)              → prep/sft_packing → sft/megatron_bridge
-sdg/data_designer (default.yaml)              →                    sft/automodel
-sdg/data_designer (customer_support_tools.yaml) → prep/sft_packing → sft/* (tool-call SFT)
-sdg/data_designer (rl_pref.yaml)              → prep/rl_prep      → rl/nemo_rl/dpo
+sdg/data_designer (default.yaml)              → data_prep/sft_packing → sft/megatron_bridge
+sdg/data_designer (default.yaml)              →                         sft/automodel
+sdg/data_designer (customer_support_tools.yaml) → data_prep/sft_packing → sft/* (tool-call SFT)
+sdg/data_designer (rl_pref.yaml)              → data_prep/rl_prep       → rl/nemo_rl/dpo
 ```
 
 ## Workflow
@@ -50,9 +50,9 @@ sdg/data_designer (rl_pref.yaml)              → prep/rl_prep      → rl/nemo_
    produce the right shape.
 4. Set `num_records` only after preview is right.
 5. Project the output explicitly to the schema the next stage expects:
-   - SFT (Megatron-Bridge): `openai_messages` → `prep/sft_packing`.
+   - SFT (Megatron-Bridge): `openai_messages` → `data_prep/sft_packing`.
    - SFT (AutoModel): `openai_messages` directly (no packing).
-   - DPO: `dpo_preference` → `prep/rl_prep`.
+   - DPO: `dpo_preference` → `data_prep/rl_prep`.
 6. Validate generated records before training — see
    [../patterns/sdg-pipeline-versioning.md](../patterns/sdg-pipeline-versioning.md).
 7. For sovereign deployments, mix synthetic with non-synthetic data and
@@ -63,8 +63,8 @@ sdg/data_designer (rl_pref.yaml)              → prep/rl_prep      → rl/nemo_
 ## Smoke commands
 
 ```bash
-nemotron step run sdg/data_designer -c tiny
-nemotron step run sdg/data_designer -c default --extra-args=--preview
+nemotron steps run sdg/data_designer -c tiny
+nemotron steps run sdg/data_designer -c default --extra-args=--preview
 ```
 
 ## Patterns to cite
diff --git a/src/nemotron/steps/sdg/data_designer/SKILL.md b/src/nemotron/steps/sdg/data_designer/SKILL.md
index 90be93d10..1985dafc1 100644
--- a/src/nemotron/steps/sdg/data_designer/SKILL.md
+++ b/src/nemotron/steps/sdg/data_designer/SKILL.md
@@ -14,11 +14,20 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - SFT SDG: use `config/default.yaml` or `config/customer_support_tools.yaml`.
 - RL preference SDG: use `config/rl_pref.yaml` for chosen and rejected preference pairs.
 - Tiny validation: use `config/tiny.yaml` or preview mode while editing columns.
+- Custom endpoint example: see the commented `providers:` block in
+  `config/customer_support_tools.yaml`.
 
 ## Configure
 
 - Set `num_records` to the target generated count only after preview output looks correct.
 - Set `seed_dataset.path` for seed-typed columns.
+- Keep `columns` references valid and preview them before scaling.
+- Set `output_projection.type` to the downstream schema:
+  `openai_messages`, `structured_messages`, or `dpo_preference`.
+- For custom inference endpoints, add `providers:` and point each
+  `models[].provider` at a declared provider name.
+- In `providers[].api_key`, write the environment variable name such as
+  `OPENAI_API_KEY`; do not resolve the secret into YAML with `${oc.env:...}`.
 - Add post-processing or projection columns so downstream steps receive the expected schema.
 - Use SFT output with AutoModel directly only after it is projected to chat `messages`.
 - Use preference output with `rl/nemo_rl/dpo` only after prompt, chosen, and rejected fields are present.
@@ -35,4 +44,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 - Keep generated schema explicit in YAML; avoid hidden assumptions in `step.py`.
 - Inspect a sample of generated records before running prep or training.
+- Do not put resolved API keys in YAML; provider `api_key` values are env-var
+  names.
 - Version prompts, seed data, model aliases, inference parameters, and projections.
diff --git a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
index 41809096a..915ec453e 100644
--- a/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
+++ b/src/nemotron/steps/sdg/data_designer/config/customer_support_tools.yaml
@@ -29,6 +29,30 @@ seed_dataset:
   strategy: shuffle
   fields: [customer_name, issue, order_id, product, policy_hint]
 
+# Optional custom endpoint example:
+#
+# To use an OpenAI-compatible endpoint instead of the built-in NVIDIA provider,
+# uncomment and edit both blocks below and remove the active `models:` block.
+# Keep providers[].api_key as the environment variable name. Data Designer
+# resolves it at request time; using `${oc.env:OPENAI_API_KEY}` here would put
+# the secret into the resolved config.
+#
+# providers:
+#   - name: my-provider
+#     endpoint: ${oc.env:OPENAI_BASE_URL}
+#     provider_type: openai
+#     api_key: OPENAI_API_KEY
+#
+# models:
+#   - alias: nvidia-text
+#     model: google/gemma-4-31B-it
+#     provider: my-provider
+#     skip_health_check: true
+#     inference_parameters:
+#       temperature: 0.75
+#       top_p: 0.95
+#       max_tokens: 1800
+
 models:
   - alias: nvidia-text
     model: openai/gpt-oss-20b
diff --git a/src/nemotron/steps/sdg/data_designer/step.py b/src/nemotron/steps/sdg/data_designer/step.py
index 4e1db779b..e172d087c 100644
--- a/src/nemotron/steps/sdg/data_designer/step.py
+++ b/src/nemotron/steps/sdg/data_designer/step.py
@@ -292,6 +292,31 @@ def records_from_designer_result(result: Any) -> list[dict[str, Any]]:
     raise TypeError(f"Unsupported Data Designer dataset type: {type(dataset).__name__}")
 
 
+def build_model_providers(cfg: dict[str, Any], dd: Any) -> list[Any] | None:
+    """Build custom Data Designer model providers from optional YAML config."""
+    providers = cfg.get("providers") or []  # absent, null, or empty all mean "no custom providers"
+    if not providers:
+        return None  # forwarded by main() as DataDesigner(model_providers=None); presumably keeps built-in providers — confirm
+    if not isinstance(providers, list):  # only reached for truthy values, e.g. a non-empty mapping or string
+        raise ValueError("`providers:` must be a list when declared")
+
+    model_providers = []  # one dd.ModelProvider per entry, in declaration order
+    for spec in providers:
+        if not isinstance(spec, dict):
+            raise ValueError("each `providers:` entry must be a mapping")
+        model_providers.append(
+            dd.ModelProvider(
+                name=spec["name"],  # required: a missing key raises KeyError
+                endpoint=spec["endpoint"],  # required: a missing key raises KeyError
+                provider_type=spec.get("provider_type", "openai"),  # "openai" when omitted
+                api_key=spec.get("api_key") or None,  # empty/missing -> None; config docs say this is an env-var name, not the secret — confirm
+                extra_body=spec.get("extra_body"),  # optional; None when omitted
+                extra_headers=spec.get("extra_headers"),  # optional; None when omitted
+            )
+        )
+    return model_providers
+
+
 def main() -> None:
     config_path, cli_overrides = parse_config_and_overrides(default_config=DEFAULT_CONFIG)
     raw = apply_hydra_overrides(load_omegaconf_yaml(config_path), cli_overrides)
@@ -345,7 +370,7 @@ def main() -> None:
 
     build_columns(builder, columns, dd)
 
-    client = DataDesigner()
+    client = DataDesigner(model_providers=build_model_providers(cfg, dd))
 
     if cfg.get("preview", False):
         result = client.preview(builder, num_records=cfg["num_records"])
diff --git a/src/nemotron/steps/sdg/data_designer/step.toml b/src/nemotron/steps/sdg/data_designer/step.toml
index 87d036f7f..665a1a7b1 100644
--- a/src/nemotron/steps/sdg/data_designer/step.toml
+++ b/src/nemotron/steps/sdg/data_designer/step.toml
@@ -18,8 +18,9 @@ name = "Synthetic Data Generation (NeMo Data Designer)"
 category = "sdg"
 description = """\
 Build a NeMo Data Designer pipeline declaratively and generate synthetic data.
-Two recipes ship in config/: 'default' produces SFT chat data,
-'rl_pref' produces preference pairs (chosen / rejected) for DPO.
+Three recipes ship in config/: 'default' produces SFT chat data,
+'customer_support_tools' produces tool-call SFT data, and 'rl_pref' produces
+preference pairs (chosen / rejected) for DPO.
 
 Customisation lives in YAML — step.py just translates declarative column specs
 into the upstream DataDesignerConfigBuilder API."""
@@ -43,22 +44,64 @@ default = 1000
 name = "seed_dataset.path"
 description = "Path to seed JSONL referenced by 'seed'-typed columns."
 
+[[parameters]]
+name = "providers"
+description = "Optional custom Data Designer model providers for OpenAI-compatible or Anthropic endpoints. Provider api_key values should be environment variable names, not resolved secret values."
+
+[[parameters]]
+name = "models.provider"
+description = "Provider name for each model alias. Use a built-in provider such as 'nvidia', or a name declared under providers."
+
+[[parameters]]
+name = "columns"
+description = "Declarative Data Designer column specs. Prompts may reference seed and prior columns, so validate references in preview before scaling."
+
+[[parameters]]
+name = "output_projection.type"
+description = "Final output schema projection: openai_messages for SFT, structured_messages for tool-call SFT, or dpo_preference for DPO."
+choices = ["openai_messages", "structured_messages", "dpo_preference"]
+
 [[strategies]]
 when = "Iterating on column specs"
 then = "Run with --preview to emit a small batch via client.preview() before scaling to client.create()."
 
+[[strategies]]
+when = "Using a self-hosted or non-NVIDIA OpenAI-compatible endpoint"
+then = "Use the commented providers example in config/customer_support_tools.yaml and set OPENAI_BASE_URL plus OPENAI_API_KEY in the execution environment."
+
 [[strategies]]
 when = "Generating preference data for DPO"
 then = "Use config/rl_pref.yaml — it emits chosen / rejected fields ready for rl/nemo_rl/dpo."
 
 [[strategies]]
-when = "Output needs to feed prep/sft_packing"
+when = "Output needs to feed data_prep/sft_packing"
 then = "Add a final post-processing column or transform that projects the generated columns into OpenAI 'messages' chat format."
 
+[[strategies]]
+when = "Generating tool-call SFT data"
+then = "Use config/customer_support_tools.yaml and project messages plus tools with structured_messages so downstream SFT sees assistant tool_calls and matching tool responses."
+
+[[errors]]
+name = "provider_secret_resolved_into_yaml"
+recovery = "Set providers[].api_key to an environment variable name such as OPENAI_API_KEY; do not store resolved secret values in YAML."
+
+[[errors]]
+name = "unknown_model_provider"
+recovery = "Ensure every models[].provider is either a built-in provider or appears in providers[].name."
+
+[[errors]]
+name = "projection_schema_mismatch"
+recovery = "Set output_projection.type to the downstream schema: openai_messages for SFT, structured_messages for tool-call SFT, or dpo_preference for DPO."
+
+[[errors]]
+name = "unvalidated_generation_scaled"
+recovery = "Run preview or config/tiny.yaml and inspect generated records before raising num_records."
+
 [reference]
+skill = "src/nemotron/steps/sdg/data_designer/SKILL.md"
 repo = "https://github.com/NVIDIA-NeMo/DataDesigner"
 readme = "https://github.com/NVIDIA-NeMo/DataDesigner/blob/main/README.md"
 library_docs = "https://nvidia-nemo.github.io/DataDesigner/"
 nemo_platform_docs = "https://docs.nvidia.com/nemo/microservices/latest/data-designer/index.html"
 nemo_platform_quickstart = "https://docs.nvidia.com/nemo/microservices/latest/data-designer/quickstart.html"
-skills = ["skills/nemotron-customize/context/data-designer-sdg.txt"]
+skills = ["skills/nemotron-customize/references/context/data-designer-sdg.txt"]
diff --git a/src/nemotron/steps/sft/SKILL.md b/src/nemotron/steps/sft/SKILL.md
index 375d9ca69..d4cbb67a1 100644
--- a/src/nemotron/steps/sft/SKILL.md
+++ b/src/nemotron/steps/sft/SKILL.md
@@ -12,7 +12,7 @@ Pick an SFT backend and keep data and checkpoint formats compatible.
 | Backend | Best for | Min GPUs | Input | Output |
 |---|---|---|---|---|
 | [`sft/automodel`](automodel/SKILL.md) | HF-native outputs, direct JSONL, smaller GPU counts, quick LoRA experiments | 4 | `training_jsonl` (no packing) | `checkpoint_hf` |
-| [`sft/megatron_bridge`](megatron_bridge/SKILL.md) | Large distributed runs with TP/PP/CP, packed-sequence throughput, Nano3/Super3 recipe parity | 8 (Nano3), 32 (Super3) | `packed_parquet` (needs `prep/sft_packing`) | `checkpoint_megatron` |
+| [`sft/megatron_bridge`](megatron_bridge/SKILL.md) | Large distributed runs with TP/PP/CP, packed-sequence throughput, Nano3/Super3 recipe parity | 8 (Nano3), 32 (Super3) | `packed_parquet` (needs `data_prep/sft_packing`) | `checkpoint_megatron` |
 
 ## Decision tree
 
@@ -25,7 +25,7 @@ Pick an SFT backend and keep data and checkpoint formats compatible.
 ## Pipeline impact
 
 **If Megatron-Bridge:**
-- Add [`prep/sft_packing`](../prep/sft_packing/SKILL.md) upstream.
+- Add [`data_prep/sft_packing`](../data_prep/sft_packing/SKILL.md) upstream.
 - Output is `checkpoint_megatron`. For HF-format consumers downstream, add
   [`convert/megatron_to_hf`](../convert/megatron_to_hf/step.toml).
 
@@ -36,24 +36,40 @@ Pick an SFT backend and keep data and checkpoint formats compatible.
 
 ## Workflow
 
-1. **Env profile first** — for Lepton/Slurm/Ray runs verify the env profile.
-   Default lookup is repo-root `env.toml`; backend-specific files
-   (`env.lepton.toml`, `env.slurm.toml`) require `NEMOTRON_ENV_FILE`.
-2. Read the chosen step's `step.toml` for parameters/strategies/errors.
-3. Smoke-test with `config/tiny.yaml` before scaling.
-4. Keep `pack_size`, `seq_length`, tokenizer, and chat template identical
+1. Read the chosen step's `step.toml` for parameters/strategies/errors.
+2. Smoke-test with `config/tiny.yaml` before scaling.
+3. Keep `pack_size`, `seq_length`, tokenizer, and chat template identical
    across prep, train, eval, and deployment — see
    [../patterns/prep-data-is-tokenizer-locked.md](../patterns/prep-data-is-tokenizer-locked.md).
+4. For remote submission, select the profile from
+   `env/env_toml/config/{lepton,slurm,dgxcloud}.yaml` or the generated env file;
+   do not hardcode profile names here.
 5. Inspect formatted prompts and loss masks before treating a run as meaningful.
 6. Bookend with eval — see [../patterns/eval-before-and-after-training.md](../patterns/eval-before-and-after-training.md).
 
 ## Smoke commands
 
 ```bash
-nemotron step run sft/automodel -c tiny
-nemotron step run sft/megatron_bridge -c tiny   # requires compatible packed_parquet
+uv run nemotron steps run sft/automodel -c tiny --dry-run
+uv run nemotron steps run sft/megatron_bridge -c tiny --dry-run   # requires compatible packed_parquet
 ```
 
+## Project layout for generated configs
+
+Keep every generated overlay config and any supporting code under a single
+self-contained project root that also holds the local input data, so the
+whole directory is rsync/scp-portable to the remote machine that will run
+the SFT step.
+
+- `<project>/config/` for generated YAML — never write into
+  `src/nemotron/steps/sft/<backend>/config/`; the shipped `default.yaml`
+  and `tiny.yaml` stay as catalog references.
+- `<project>/data/` for local datasets, chat-format JSONL, and packed
+  Parquet splits referenced by the overlay.
+- Project-root scripts only when catalog code cannot serve the request.
+- Do not split generated files into home dirs, scratch dirs, or paths
+  outside the project root that will not ship with the bundle.
+
 ## Patterns to cite
 
 - [../patterns/prep-data-is-tokenizer-locked.md](../patterns/prep-data-is-tokenizer-locked.md) — tokenizer/template/seq_length must match prep.
@@ -70,7 +86,7 @@ nemotron step run sft/megatron_bridge -c tiny   # requires compatible packed_par
   problem (`convert/megatron_to_hf` / `convert/hf_to_megatron`).
 - For Megatron-Bridge, `pack_size` (in prep) must equal `seq_length` (in
   training). Mismatch surfaces as shape errors mid-train.
-- For AutoModel, never add `prep/sft_packing` — the trainer reads JSONL
+- For AutoModel, never add `data_prep/sft_packing` — the trainer reads JSONL
   directly.
 - Inspect formatted prompts and loss masks before trusting loss curves —
   template bugs look like quality bugs.
diff --git a/src/nemotron/steps/sft/automodel/SKILL.md b/src/nemotron/steps/sft/automodel/SKILL.md
index 4c100de44..680b48b79 100644
--- a/src/nemotron/steps/sft/automodel/SKILL.md
+++ b/src/nemotron/steps/sft/automodel/SKILL.md
@@ -17,6 +17,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Configure
 
+- Set `model.pretrained_model_name_or_path` to the HF base or checkpoint.
+- Set `dataset.path_or_dataset_id` to chat-format JSONL, not packed Parquet.
 - Default `sft/automodel` is full fine-tuning (`peft=null`); use `peft/automodel` or add a LoRA `peft:` block when adapter training is intended.
 - Keep `peft=lora` for memory-constrained runs or fast adapter experiments.
 - Choose a tokenizer with chat-template support or preprocess prompts explicitly.
@@ -38,6 +40,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Guardrails
 
-- Do not add `prep/sft_packing`; AutoModel reads JSONL directly.
+- Do not add `data_prep/sft_packing`; AutoModel reads JSONL directly.
+- Keep `dataloader.collate_fn` on the chat collater unless you intentionally
+  provide pre-tokenized data.
 - Reduce batch size or switch to LoRA before changing unrelated training logic for OOMs.
 - Inspect formatted conversations before trusting loss curves.
diff --git a/src/nemotron/steps/sft/automodel/step.toml b/src/nemotron/steps/sft/automodel/step.toml
index 1c61ab5b9..430ca2bde 100644
--- a/src/nemotron/steps/sft/automodel/step.toml
+++ b/src/nemotron/steps/sft/automodel/step.toml
@@ -47,12 +47,25 @@ name = "meta-llama/Llama-3.1-8B-Instruct"
 description = "Common HF baseline for single-node SFT and LoRA."
 min_gpus = 4
 
+[[parameters]]
+name = "model.pretrained_model_name_or_path"
+description = "HF base model or checkpoint used for full SFT or LoRA-style AutoModel tuning."
+
+[[parameters]]
+name = "dataset.path_or_dataset_id"
+description = "Chat-format JSONL path or dataset id. AutoModel reads training_jsonl directly; do not point this at packed Parquet."
+
 [[parameters]]
 name = "peft"
 description = "Use 'lora' for adapter tuning, or 'null' for full fine-tuning."
 default = "null"
 choices = ["lora", "null"]
 
+[[parameters]]
+name = "dataloader.collate_fn"
+description = "Chat dataset collater. Use the AutoModel default_collater unless the dataset was pre-tokenized deliberately."
+default = "nemo_automodel.components.datasets.utils.default_collater"
+
 [[strategies]]
 when = "1-2 GPUs or memory is tight"
 then = "prefer peft=lora and start with a Mistral-class model"
@@ -64,7 +77,7 @@ skill = "Automodel/docs/guides/llm/finetune.md"
 
 [[strategies]]
 when = "dataset already uses OpenAI chat-format JSONL"
-then = "skip prep/sft_packing and train directly from training_jsonl"
+then = "skip data_prep/sft_packing and train directly from training_jsonl"
 skill = "Automodel/docs/guides/dataset-overview.md"
 
 [[strategies]]
@@ -80,7 +93,12 @@ recovery = "use a tokenizer with chat-template support, or convert the data to p
 name = "oom"
 recovery = "switch to peft=lora, reduce batch size, or move to a smaller model"
 
+[[errors]]
+name = "packed_parquet_used_with_automodel"
+recovery = "Use training_jsonl directly. If the only data artifact is packed_parquet, go back to the source JSONL or choose sft/megatron_bridge."
+
 [reference]
+skill = "src/nemotron/steps/sft/automodel/SKILL.md"
 repo = "https://github.com/NVIDIA-NeMo/Automodel"
 readme = "https://github.com/NVIDIA-NeMo/Automodel/blob/main/README.md"
 docs = "https://docs.nvidia.com/nemo/automodel/latest/index.html"
diff --git a/src/nemotron/steps/sft/guide.md b/src/nemotron/steps/sft/guide.md
index 50df7dd18..3412bd288 100644
--- a/src/nemotron/steps/sft/guide.md
+++ b/src/nemotron/steps/sft/guide.md
@@ -2,7 +2,7 @@
 
 | Backend | Best for | Min GPUs | Data format | Checkpoint format |
 |---------|----------|----------|-------------|-------------------|
-| **Megatron-Bridge** | Large-scale distributed training with TP / PP / CP control | 8 | packed_parquet (needs `prep/sft_packing`) | checkpoint_megatron |
+| **Megatron-Bridge** | Large-scale distributed training with TP / PP / CP control | 8 | packed_parquet (needs `data_prep/sft_packing`) | checkpoint_megatron |
 | **AutoModel** | Simpler setup, fewer GPUs, LoRA / PEFT, quick iteration | 4 | training_jsonl (no packing) | checkpoint_hf |
 
 ## Quick decision tree
@@ -16,7 +16,7 @@
 ## Impact on the pipeline
 
 ### If you choose Megatron-Bridge
-- Add `prep/sft_packing` upstream.
+- Add `data_prep/sft_packing` upstream.
 - Input artifact becomes `packed_parquet`.
 - Output artifact is `checkpoint_megatron`.
 - If you later need HuggingFace format, add a conversion step.
diff --git a/src/nemotron/steps/sft/megatron_bridge/SKILL.md b/src/nemotron/steps/sft/megatron_bridge/SKILL.md
index 1c852b301..9bcde3d39 100644
--- a/src/nemotron/steps/sft/megatron_bridge/SKILL.md
+++ b/src/nemotron/steps/sft/megatron_bridge/SKILL.md
@@ -11,14 +11,17 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Inputs And Outputs
 
-- Consume `packed_parquet` from `prep/sft_packing`.
+- Consume `packed_parquet` from `data_prep/sft_packing`.
 - Optionally consume a `checkpoint_megatron` base or prior checkpoint.
 - Produce `checkpoint_megatron`.
 - Validate packed data, parallelism, and checkpoint output with a short run before scaling.
 
 ## Configure
 
-- Keep `seq_length` equal to the prep step's `pack_size`.
+- Keep `seq_length` equal to the `pack_size` used by the `data_prep/sft_packing` step.
+- Point `dataset.packed_sequence_specs.packed_train_data_path` at the packed
+  `splits/train/*.parquet` glob.
+- Keep base checkpoint paths separate from `checkpoint.save`.
 - Start Nano3 plans around the existing recipe defaults; scale Super3-like plans only after short validation runs pass.
 - Tune tensor, pipeline, and context parallelism before scaling global batch.
 - The shipped 30B default uses `peft=lora` to fit the starter topology; set `recipe.peft=null` and remove the top-level `peft:` block only when full SFT fits.
@@ -31,6 +34,8 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 - Keep `dataset.seq_length`, `dataset.packed_sequence_specs.packed_sequence_size`, and `model.seq_length` equal.
 - Use `model.sequence_parallel: true` for MoE plus tensor parallelism.
 - Start with `train.micro_batch_size: 1` when validating a new distributed shape and choose `train.global_batch_size` as a multiple of the resulting data-parallel size.
+- Inspect the loss masks produced by `data_prep/sft_packing` before trusting
+  loss curves from a new template or tool-call format.
 
 ## Local Files
 
@@ -41,6 +46,6 @@ Before changing configs or code, read `step.toml` to understand the step flow, c
 
 ## Guardrails
 
-- Run `prep/sft_packing` first unless a compatible packed dataset already exists.
+- Run `data_prep/sft_packing` first unless a compatible packed dataset already exists.
 - Repack data after tokenizer, template, or sequence length changes.
 - Convert Megatron checkpoints to HF format before HF-native evaluation or deployment.
diff --git a/src/nemotron/steps/sft/megatron_bridge/config/default.yaml b/src/nemotron/steps/sft/megatron_bridge/config/default.yaml
index 27cb6cab0..455e593c5 100644
--- a/src/nemotron/steps/sft/megatron_bridge/config/default.yaml
+++ b/src/nemotron/steps/sft/megatron_bridge/config/default.yaml
@@ -15,7 +15,7 @@
 # Tiny SFT for the 2-node slurm_sft profile.
 # Pulls base weights from HuggingFace via AutoBridge so no pre-existing
 # Megatron checkpoint is required. Uses the packed_parquet shards produced by
-# `nemotron step run prep/sft_packing -c tiny -r slurm_sft_dataprep_tiny`.
+# `nemotron steps run data_prep/sft_packing -c tiny -r slurm_prep_sft_packing`.
 #
 # Note: dataset paths are explicit (Megatron-Bridge's vanilla schema). This
 # step is generic — recipe-specific dir shortcuts live in recipes/, not here.
@@ -58,9 +58,9 @@ dataset:
   seq_length: 4096
   packed_sequence_specs:
     packed_sequence_size: 4096
-    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/functional/prep/sft_packing/tiny/splits/train}
+    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/functional/data_prep/sft_packing/tiny/splits/train}
     # No validation shards from sample=1000 run; uncomment when valid/ exists.
-    # packed_val_data_path: ${oc.env:SFT_PACKED_VAL_DIR,/mnt/lustre-shared/output/functional/prep/sft_packing/tiny/splits/valid}
+    # packed_val_data_path: ${oc.env:SFT_PACKED_VAL_DIR,/mnt/lustre-shared/output/functional/data_prep/sft_packing/tiny/splits/valid}
 
 train:
   train_iters: 10
diff --git a/src/nemotron/steps/sft/megatron_bridge/config/nano3.yaml b/src/nemotron/steps/sft/megatron_bridge/config/nano3.yaml
deleted file mode 100644
index 373b52d5d..000000000
--- a/src/nemotron/steps/sft/megatron_bridge/config/nano3.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# SFT defaults for Megatron-Bridge + Nano3
-# See full version: src/nemotron/recipes/nano3/stage1_sft/config/default.yaml
-
-recipe:
-  _target_: megatron.bridge.recipes.nemotronh.nemotron_3_nano.nemotron_3_nano_finetune_config
-  packed_sequence: true
-  peft: null
-
-dataset:
-  nano3_packed_sft_dir: /path/to/packed/data
-  seq_length: 4096
-  packed_sequence_specs:
-    packed_sequence_size: 4096
-
-train:
-  train_iters: 1700
-  global_batch_size: 4
-
-model:
-  seq_length: 4096
-  tensor_model_parallel_size: 4
-  pipeline_model_parallel_size: 1
-  context_parallel_size: 2
-
-checkpoint:
-  save: /nemo_run/sft-model
-  save_interval: 20
-  pretrained_checkpoint: /path/to/pretrained/checkpoint
-  finetune: true
diff --git a/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml b/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml
index 7eed9c173..dd8b509ef 100644
--- a/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml
+++ b/src/nemotron/steps/sft/megatron_bridge/config/tiny.yaml
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Tiny SFT for the 2-node lepton_sft_train profile.
+# Tiny SFT for the 2-node lepton_sft_megatron_bridge profile.
 # Points at lepton's lustre-shared mount by default.
 # Pulls base weights from HuggingFace via AutoBridge so no pre-existing
 # Megatron checkpoint is required. Uses the packed_parquet shards produced by
-# `nemotron step run prep/sft_packing -c tiny -r lepton_sft_dataprep`.
+# `nemotron steps run data_prep/sft_packing -c tiny -r lepton_prep_sft_packing`.
 
 # Override the container's stale Megatron-Bridge with a branch that supports
 # packed-parquet specs.
@@ -45,7 +45,7 @@ dataset:
     # Canonical splits/ layout produced by step.py's
     # realize_packed_shards_into_split_dirs() post-processing — sits at the
     # SFT_OUTPUT_DIR root (the pipeline manages sample-N internally).
-    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/test/sft_dataprep/splits/train/*.parquet}
+    packed_train_data_path: ${oc.env:SFT_PACKED_DIR,/mnt/lustre-shared/output/functional/data_prep/sft_packing/tiny/splits/train/*.parquet}
 
 train:
   train_iters: 10
diff --git a/src/nemotron/steps/sft/megatron_bridge/step.py b/src/nemotron/steps/sft/megatron_bridge/step.py
index db63fa5ac..fbe826aa6 100644
--- a/src/nemotron/steps/sft/megatron_bridge/step.py
+++ b/src/nemotron/steps/sft/megatron_bridge/step.py
@@ -48,7 +48,7 @@
 
 from nemotron.steps._runners.megatron_bridge import run_megatron_bridge
 
-DEFAULT_CONFIG = Path(__file__).parent / "config" / "nano3.yaml"
+DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 DEFAULT_RECIPE = "megatron.bridge.recipes.nemotronh.nemotron_3_nano.nemotron_3_nano_finetune_config"
 
 
diff --git a/src/nemotron/steps/sft/megatron_bridge/step.toml b/src/nemotron/steps/sft/megatron_bridge/step.toml
index 5dad7bd38..5117e3968 100644
--- a/src/nemotron/steps/sft/megatron_bridge/step.toml
+++ b/src/nemotron/steps/sft/megatron_bridge/step.toml
@@ -5,7 +5,7 @@ category = "sft"
 description = """\
 Supervised fine-tuning using NVIDIA Megatron-Bridge. Best for large-scale
 distributed training with tensor/pipeline/context parallelism. Requires
-packed Parquet data from prep/sft_packing."""
+packed Parquet data from data_prep/sft_packing."""
 tags = ["sft", "fine-tuning", "megatron", "distributed-training"]
 
 [[consumes]]
@@ -35,20 +35,41 @@ min_gpus = 32
 
 [[parameters]]
 name = "seq_length"
-description = "Must match pack_size in prep/sft_packing"
+description = "Must match pack_size in data_prep/sft_packing"
 default = 4096
 choices = [2048, 4096, 8192, 16384, 32768]
 
+[[parameters]]
+name = "dataset.packed_sequence_specs.packed_train_data_path"
+description = "Packed training Parquet glob, usually data_prep/sft_packing output_dir/splits/train/*.parquet."
+
 [[parameters]]
 name = "peft"
 description = "'lora' = the shipped 30B default, 'null' = full SFT when the model fits"
 default = "lora"
 choices = ["null", "lora"]
 
+[[parameters]]
+name = "checkpoint.pretrained_checkpoint"
+description = "Optional Megatron base checkpoint or prior SFT checkpoint. Keep it distinct from the output checkpoint.save path."
+
+[[parameters]]
+name = "train.micro_batch_size"
+description = "Per-rank micro batch size. Start at 1 when validating a new distributed shape."
+default = 1
+
+[[parameters]]
+name = "train.global_batch_size"
+description = "Global batch size. Keep it divisible by the resulting data-parallel size."
+
 [[strategies]]
 when = "dataset < 10K records"
 then = "reduce global_batch_size and increase train_iters to keep optimizer steps useful"
 
+[[strategies]]
+when = "consuming packed Parquet"
+then = "Set recipe.packed_sequence=true and keep dataset.seq_length, dataset.packed_sequence_specs.packed_sequence_size, model.seq_length, and data_prep pack_size identical."
+
 [[strategies]]
 when = "user wants LoRA"
 then = "set peft=lora to reduce GPU requirements and checkpoint size"
@@ -76,7 +97,7 @@ skill = "Megatron-Bridge/skills/perf-techniques/sequence-packing/SKILL.md"
 
 [[errors]]
 name = "tokenizer_mismatch"
-recovery = "set prep/sft_packing tokenizer to match the training model"
+recovery = "set data_prep/sft_packing tokenizer to match the training model"
 
 [[errors]]
 name = "oom"
@@ -85,9 +106,18 @@ skill = "Megatron-Bridge/skills/perf-techniques/parallelism-strategies/SKILL.md"
 
 [[errors]]
 name = "missing_packed_data"
-recovery = "run prep/sft_packing first, or point dataset.nano3_packed_sft_dir at the packed splits directory"
+recovery = "run data_prep/sft_packing first, or point dataset.packed_sequence_specs.packed_train_data_path at the packed splits/train/*.parquet glob"
+
+[[errors]]
+name = "bad_parallel_batch_shape"
+recovery = "Set train.global_batch_size as a multiple of data-parallel size and start with train.micro_batch_size=1 while validating TP/PP/CP."
+
+[[errors]]
+name = "bad_loss_masks_or_template"
+recovery = "Inspect packed records from data_prep/sft_packing before training; prompt/template bugs often look like poor SFT quality."
 
 [reference]
+skill = "src/nemotron/steps/sft/megatron_bridge/SKILL.md"
 recipe = "src/nemotron/recipes/nano3/stage1_sft/"
 config = "src/nemotron/recipes/nano3/stage1_sft/config/default.yaml"
 script = "src/nemotron/recipes/nano3/stage1_sft/train.py"
@@ -96,4 +126,4 @@ megatron_bridge_readme = "https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/ma
 megatron_bridge_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/"
 training_entry_points = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/entry-points.html"
 packed_sequences_docs = "https://docs.nvidia.com/nemo/megatron-bridge/latest/training/packed-sequences.html"
-skills = ["skills/nemotron-customize/context/mbridge-sft.txt"]
+skills = ["skills/nemotron-customize/references/context/mbridge-sft.txt"]
diff --git a/src/nemotron/steps/synth/__init__.py b/src/nemotron/steps/synth/__init__.py
deleted file mode 100644
index 51eb13e90..000000000
--- a/src/nemotron/steps/synth/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Synthetic-data step category."""
diff --git a/src/nemotron/steps/synth/data_designer/step.toml b/src/nemotron/steps/synth/data_designer/step.toml
deleted file mode 100644
index c4e049876..000000000
--- a/src/nemotron/steps/synth/data_designer/step.toml
+++ /dev/null
@@ -1,16 +0,0 @@
-[step]
-id = "synth/data_designer"
-name = "Synthetic Data Generation (Data Designer)"
-category = "synth"
-status = "planned"
-description = "Planned: generate synthetic conversation JSONL with Data Designer for downstream SFT."
-tags = ["planned", "synthetic-data", "data-designer", "jsonl"]
-
-[[consumes]]
-type = "training_jsonl"
-description = "Optional seed examples or schema references for synthetic generation."
-required = false
-
-[[produces]]
-type = "synthetic_jsonl"
-description = "Generated conversation JSONL in training-ready chat format."
diff --git a/src/nemotron/steps/translate/SKILL.md b/src/nemotron/steps/translate/SKILL.md
new file mode 100644
index 000000000..a5e807562
--- /dev/null
+++ b/src/nemotron/steps/translate/SKILL.md
@@ -0,0 +1,151 @@
+---
+name: nemotron-translate
+description: Translate JSONL or Parquet training corpora with NeMo Curator, including structured chat fields, hosted LLM, NMT, Google, AWS, optional FAITH scoring, and skip-already-translated input rows. Use when preparing multilingual data before data prep, SFT, CPT, or review.
+---
+
+# Nemotron Translation
+
+Use this skill when a user wants to translate corpus data, chat records, or row-oriented training artifacts. The concrete step is [`translate/nemo_curator`](nemo_curator/SKILL.md).
+
+## Default Workflow
+
+1. Install runtime dependencies with `uv sync --extra translate`.
+2. Read [`nemo_curator/step.toml`](nemo_curator/step.toml) for the step contract.
+3. Ask for `source_language`, `target_language`, input path, output path, backend, and field path. Do not infer source or target language silently.
+4. For downstream training data, start with `output_mode=replaced`, `merge_scores=false`, and `faith_eval.enabled=false`.
+5. For audit or quality review, use `output_mode=both` and enable `faith_eval`.
+6. Run a two-row smoke test before a large corpus.
+7. Validate row count, schema, translated field content, and that secrets were not printed.
+
+For one-shot translation requests, do not end in exploration mode. Provide the
+minimal runnable handoff first:
+
+- `Decision`: chosen step and scope, such as `translation-only` or `translation+FAITH`.
+- `Config`: key fields or config path bound to a concrete input path and format.
+- `Run`: exact command.
+- `Output`: expected output directory and artifacts.
+- `Env`: required environment variable names only, never values.
+
+Before finalizing, make these constraints explicit in prose even when the
+command implies them:
+
+- Input path and format selected for the run.
+- Incompatible inputs excluded, such as mixed JSONL and Parquet roots.
+- Observed language mismatch versus requested translation direction, with the
+  assumption used.
+- Exact model variant used, or the default model assumption if the user gave
+  only a model family.
+- Credential environment variable names used for auth.
+
+For `translation+FAITH`, add a short `FAITH handoff` section that confirms
+`faith_eval.enabled=true`, states any filter assumptions, lists expected
+FAITH-related outputs, and includes the exact run command and output path.
+
+## Backend Choice
+
+| Need | Backend | Notes |
+| --- | --- | --- |
+| Structured chat, tool calls, JSON, code, or high formatting fidelity | `llm` | Use OpenAI-compatible endpoint settings under `server`. |
+| Large plain-text corpus with local service | `nmt` | Service must expose `GET /health` and `POST /translate`. |
+| Managed provider translation | `google` or `aws` | Credentials must come from environment or provider config, not YAML secrets. |
+| Quality scoring for any backend | Any, plus `faith_eval.enabled=true` | FAITH still needs an LLM client (the `server` block) even when the translation backend is not `llm`. |
+
+## Common Commands
+
+Plain text JSONL through a hosted LLM:
+
+```bash
+uv run --no-sync nemotron steps run translate/nemo_curator \
+  input_path="$TR_ROOT/news_en" \
+  output_dir="$TR_ROOT/out_llm_hi" \
+  source_language=en \
+  target_language=hi \
+  backend=llm \
+  text_field=text \
+  output_mode=replaced \
+  merge_scores=false \
+  reconstruct_messages=false \
+  faith_eval.enabled=false \
+  server.url="$TRANSLATION_BASE_URL" \
+  server.model="$TRANSLATION_MODEL" \
+  server.api_key_env=NVIDIA_API_KEY
+```
+
+Structured chat records:
+
+```bash
+uv run --no-sync nemotron steps run translate/nemo_curator \
+  input_path="$TR_ROOT/chat_code_en.jsonl" \
+  output_dir="$TR_ROOT/out_chat_hi" \
+  source_language=en \
+  target_language=hi \
+  backend=llm \
+  text_field='messages.*.content' \
+  output_mode=replaced \
+  merge_scores=false \
+  reconstruct_messages=true \
+  faith_eval.enabled=false \
+  server.url="$TRANSLATION_BASE_URL" \
+  server.model="$TRANSLATION_MODEL" \
+  server.api_key_env=NVIDIA_API_KEY
+```
+
+NMT server:
+
+```bash
+uv run --no-sync nemotron steps run translate/nemo_curator \
+  input_path="$TR_ROOT/news_en" \
+  output_dir="$TR_ROOT/out_nmt_hi" \
+  source_language=en \
+  target_language=hi \
+  backend=nmt \
+  nmt.server_url="$NMT_SERVER_URL" \
+  text_field=text \
+  output_mode=replaced \
+  merge_scores=false \
+  reconstruct_messages=false \
+  faith_eval.enabled=false
+```
+
+## Patterns To Cite
+
+- [`../patterns/translate-training-corpus.md`](../patterns/translate-training-corpus.md) for inserting translation before prep or training.
+- [`../patterns/prefer-llm-for-structured-chat.md`](../patterns/prefer-llm-for-structured-chat.md) for chat, tool-call, JSON, and code-heavy data.
+- [`../patterns/prefer-nmt-for-large-corpora.md`](../patterns/prefer-nmt-for-large-corpora.md) for large plain-text translation.
+- [`../patterns/enable-faith-for-high-value-data.md`](../patterns/enable-faith-for-high-value-data.md) for quality annotation or filtering.
+- [`../patterns/multilingual-tokenizer-check.md`](../patterns/multilingual-tokenizer-check.md) before using translated data for SFT or CPT.
+
+## Guardrails
+
+- Do not build custom readers or writers first. Use Curator `JsonlReader` or `ParquetReader`, `TranslationStage`, and `JsonlWriter` or `ParquetWriter`.
+- Do not mix JSONL and Parquet in one input directory.
+- If the user provides a mixed-format root, require an explicit include/exclude decision before running.
+- Do not use `merge_scores=true` with `output_mode=replaced`; use `output_mode=both` if scores must be merged.
+- Do not treat `skip_translated=true` as output-directory resume. It only skips input rows that already contain a non-empty translation column.
+- Do not enable FAITH filtering without telling the user that rows may be dropped.
+- Keep API keys in environment variables such as `NVIDIA_API_KEY`, `NGC_API_KEY`, AWS credentials, or Google application credentials.
+- Never run environment-dump commands such as `env`, `printenv`, `set`, or broad `export` listings.
+- For diagnostics, mention only environment variable names and keep values redacted.
+
+## Troubleshooting
+
+- CLI mismatch or unexpected-argument errors: return to the documented command
+  shape in this file and confirm supported flags with `--help`; do not invent
+  alternate subcommands.
+- Missing translation dependencies: run `uv sync --extra translate` first.
+  If an eval/runtime environment is still missing basics such as `toml` or
+  `pyyaml`, report the blocker and still provide the runnable handoff.
+- Mixed `.jsonl` and `.parquet` roots: bind `input_path` to one format only and
+  explicitly state excluded paths or formats.
+- Missing `translate/nemo_curator` metadata in a runtime workspace: treat it as
+  an environment/path issue, state the blocker, and provide the canonical
+  command for a complete checkout.
+- Path-not-found during validation: inspect actual created paths before
+  retrying; do not guess output roots.
+
+## Load More
+
+- [`guide.md`](guide.md) for detailed flow, output modes, FAITH, resume semantics, and validation.
+- [`nemo_curator/SKILL.md`](nemo_curator/SKILL.md) for the concrete step.
+- [`nemo_curator/config/default.yaml`](nemo_curator/config/default.yaml) for starter config.
+- [`nemo_curator/step.py`](nemo_curator/step.py) for the reader -> translation stage -> writer implementation.
diff --git a/src/nemotron/steps/translate/guide.md b/src/nemotron/steps/translate/guide.md
new file mode 100644
index 000000000..c49ff6524
--- /dev/null
+++ b/src/nemotron/steps/translate/guide.md
@@ -0,0 +1,225 @@
+# Translation Guide
+
+## Purpose
+
+`translate/nemo_curator` turns row-oriented corpus data into translated corpus data. It is intended for data that may later feed data prep, SFT, CPT, BYOB review, or human QA.
+
+The step is a thin Nemotron wrapper around NeMo Curator:
+
+```text
+input files
+  -> Curator JsonlReader or ParquetReader
+  -> Curator TranslationStage
+  -> Curator JsonlWriter or ParquetWriter
+output files
+```
+
+The translation sub-stages operate on Curator `DocumentBatch` objects in memory. Output files are created only by the final writer stage.
+
+## Installation
+
+Install translation dependencies before running locally:
+
+```bash
+uv sync --extra translate
+```
+
+If a QA environment needs BYOB and SDG in the same run:
+
+```bash
+uv sync --extra translate --extra byob --extra data-sdg --group run
+```
+
+For package-resource validation, confirm Curator prompt files are visible from the installed environment:
+
+```bash
+uv run --no-sync python - <<'PY'
+import importlib.resources as ir
+
+root = ir.files("nemo_curator.stages.text.experimental.translation.prompts")
+for name in ("translate.yaml", "faith_eval.yaml"):
+    path = root.joinpath(name)
+    assert path.is_file(), path
+    print(name, len(path.read_text()))
+PY
+```
+
+## Required Questions
+
+Ask these before running real translation:
+
+- What is the input path?
+- Should the output be JSONL or Parquet?
+- What is the source language code?
+- What is the target language code?
+- Which field should be translated?
+- Which backend should be used?
+- Should FAITH score rows, filter rows, or stay disabled?
+- Is the data plain text, structured chat, code-heavy, or tool-call data?
+
+Do not infer `source_language` or `target_language` silently. The starter config intentionally leaves them empty.
+
+## Field Selection
+
+Use `text_field=text` for simple row text:
+
+```json
+{"text": "The central bank raised rates."}
+```
+
+Use `text_field='messages.*.content'` for OpenAI-style chat records:
+
+```json
+{"messages": [{"role": "user", "content": "Translate this."}]}
+```
+
+For chat data, set `reconstruct_messages=true` when the user wants message structure preserved in the output.
+
+## Output Modes
+
+Use `output_mode=replaced` for downstream training. It replaces the selected source field with translated text and keeps the dataset simple.
+
+Use `output_mode=both` for audit and debugging. It keeps translated fields plus helper metadata and enables score merging.
+
+Use `output_mode=raw` only for debugging internals. It is not the usual user-facing output.
+
+Do not set `merge_scores=true` with `output_mode=replaced`. If score fields are required, use `output_mode=both`.
+
+## Backends
+
+### LLM
+
+Use `backend=llm` for hosted OpenAI-compatible translation, structured chat, code blocks, JSON-heavy rows, or tool-call data.
+
+Required config:
+
+```bash
+backend=llm
+server.url="$TRANSLATION_BASE_URL"
+server.model="$TRANSLATION_MODEL"
+server.api_key_env=NVIDIA_API_KEY
+```
+
+The API key should be in the named environment variable. Do not write secrets into checked-in YAML.
+
+### NMT
+
+Use `backend=nmt` for large plain-text corpora when a local or remote NMT service is available.
+
+The service contract is:
+
+```text
+GET  /health
+POST /translate
+```
+
+Request body:
+
+```json
+{"texts": ["Hello world"], "src_lang": "en", "tgt_lang": "hi"}
+```
+
+Response body:
+
+```json
+{"translations": ["नमस्ते दुनिया"]}
+```
+
+If using IndicTrans2, prefer the Hugging Face interface server when possible. Keep the public service API as `en` and `hi`, and map internally to IndicTrans language codes such as `eng_Latn` and `hin_Deva`.
+
+### Google And AWS
+
+Use `backend=google` or `backend=aws` only when provider credentials are configured in the runtime environment. Keep YAML limited to provider settings such as project, location, region, and concurrency.
+
+## FAITH
+
+FAITH is optional quality scoring after segment translation.
+
+Use it when translation quality needs evidence:
+
+```bash
+faith_eval.enabled=true
+faith_eval.filter_enabled=false
+faith_eval.model_name="$FAITH_MODEL"
+```
+
+Set `faith_eval.filter_enabled=true` only when the user explicitly wants low-quality rows dropped.
+
+FAITH uses an LLM client even when translation uses `nmt`, `google`, or `aws`, so the `server` block still needs a valid endpoint and API key.
+
+If the FAITH model returns an empty or invalid response, the run should fail clearly. Treat this as a model or implementation issue, not as successful translation.
+
+## Skip Already Translated Rows
+
+`skip_translated=true` checks input rows for a non-empty `translation_column`, usually `translated_text`.
+
+It is not output-directory resume.
+
+Correct use:
+
+```json
+{"text": "The central bank raised rates.", "translated_text": "केंद्रीय बैंक ने दरें बढ़ाईं।"}
+{"text": "Heavy rains caused flooding.", "translated_text": ""}
+```
+
+Expected behavior:
+
+- Rows with non-empty `translated_text` are not sent to the backend.
+- Rows with empty or missing `translated_text` are translated.
+- Skipped rows are restored before writing output.
+- The output directory is still overwritten by the writer.
+
+Do not test `skip_translated=true` by pointing input at a fully translated output directory unless the expected behavior is all rows skipped.
+
+## Input And Output Directories
+
+Output directory names in QA are descriptive only:
+
+| Directory | Meaning |
+| --- | --- |
+| `out_llm_hi` | LLM backend output translated to Hindi. |
+| `out_chat_hi` | Structured chat output translated to Hindi. |
+| `out_faith_annotated` | Translation with FAITH scores and no filtering. |
+| `out_nmt_hi` | NMT backend output translated to Hindi. |
+| `out_parquet_hi` | Parquet input and Parquet output. |
+| `out_resume_hi` | Output from a partial input with `skip_translated=true`. |
+| `out_mixed_should_fail` | Intentional negative test for mixed formats. |
+
+The writer uses overwrite mode. Existing output directory contents are removed at the start of a run.
+
+## Single Huge Files
+
+Curator readers are file-partition oriented. Do not add generic pandas chunking to the step by default.
+
+If the user has one huge file and Curator file partitioning is not enough, create a one-off pre-step that splits the file into homogeneous JSONL or Parquet shards, then run `translate/nemo_curator` on the shard directory.
+
+## Validation
+
+For every smoke run, verify:
+
+- Command exits 0.
+- Output files exist under `output_dir`.
+- Row count matches input when filtering is disabled.
+- Translated field exists and is non-empty.
+- Chat rows preserve `messages` shape and tool-call JSON.
+- Parquet outputs can be read with pandas.
+- Logs do not print API keys.
+
+For negative tests, verify:
+
+- Mixed JSONL and Parquet directories fail before translation.
+- Missing `source_language` or `target_language` fails clearly.
+- Missing LLM credentials fail before backend calls.
+- Missing NMT server fails with a clear health check or connection error.
+
+## Pipeline Placement
+
+Use translated outputs before downstream prep or training:
+
+```text
+translate/nemo_curator -> data_prep/sft_packing -> sft/megatron_bridge
+translate/nemo_curator -> sft/automodel
+translate/nemo_curator -> data_prep/pretrain_prep -> pretrain/*
+```
+
+After translating data for training, run a multilingual tokenizer check before packing or training so sequence length and template assumptions still hold.
diff --git a/src/nemotron/steps/translate/nemo_curator/SKILL.md b/src/nemotron/steps/translate/nemo_curator/SKILL.md
new file mode 100644
index 000000000..a27b794a4
--- /dev/null
+++ b/src/nemotron/steps/translate/nemo_curator/SKILL.md
@@ -0,0 +1,87 @@
+---
+name: nemotron-translate-curator
+description: Configure and run the concrete translate/nemo_curator step for JSONL or Parquet corpus translation with NeMo Curator's experimental TranslationStage.
+---
+
+# Translation Step (Curator backend)
+
+Use `translate/nemo_curator` for corpus translation. Before changing configs or code, read `step.toml` to understand the step contract, consumed artifacts, produced artifacts, parameters, strategies, and failure modes.
+
+## Inputs And Outputs
+
+- Consume homogeneous JSONL or Parquet records from `input_path`.
+- Translate `text_field`, such as `text` or `messages.*.content`.
+- Produce translated JSONL or Parquet shards under `output_dir`.
+- Use `output_mode=replaced` for training-ready data.
+- Use `output_mode=both` for audit data with metadata and scores.
+
+## Configure
+
+- Set `source_language` and `target_language` explicitly.
+- Set `backend` to `llm`, `nmt`, `google`, or `aws`.
+- For `backend=llm`, set `server.url`, `server.model`, and `server.api_key_env`.
+- For `backend=nmt`, set `nmt.server_url` and verify `/health` and `/translate`.
+- For structured chat, set `text_field='messages.*.content'` and `reconstruct_messages=true`.
+- For plain text, set `text_field=text` and `reconstruct_messages=false`.
+- For FAITH annotation, set `faith_eval.enabled=true` and usually `faith_eval.filter_enabled=false` first.
+- For FAITH filtering, confirm with the user because rows can be dropped.
+
+For one-shot or eval-style runs, emit an execution-ready handoff before deep
+exploration:
+
+- Step decision and scope: `translate/nemo_curator`, either translation-only or translation+FAITH.
+- Input policy: selected JSONL or Parquet path and excluded incompatible inputs.
+- Config: inline key fields or a config path.
+- Run: exact command.
+- Output: expected translated artifact path.
+- Env: required environment variable names only.
+
+If model variant or source-language evidence is incomplete, state explicit
+assumptions and continue with a runnable default rather than stalling. Prefer
+inline config in the response before optional file writes.
+
+## Local Files
+
+- Contract: `src/nemotron/steps/translate/nemo_curator/step.toml`
+- Runner: `src/nemotron/steps/translate/nemo_curator/step.py`
+- Config: `src/nemotron/steps/translate/nemo_curator/config/default.yaml`
+- Guide: `src/nemotron/steps/translate/guide.md`
+- Patterns: `src/nemotron/steps/patterns/translate-training-corpus.md`
+
+## Avoid
+
+- Do not mix JSONL and Parquet in one input directory.
+- Do not store API keys in config files.
+- Do not print environment values or run env-dump commands that may expose tokens or keys.
+- Do not use `merge_scores=true` with `output_mode=replaced`.
+- Do not treat `skip_translated=true` as output-directory resume.
+- Do not add custom chunking to `step.py` for normal use. Split huge single files before this step if needed.
+- Do not silently enable FAITH filtering for training data.
+
+## Validate
+
+- Import check: `uv run --no-sync python -c "from nemo_curator.stages.text.experimental.translation import TranslationStage; print(TranslationStage)"`.
+- Prompt resource check: verify `translate.yaml` and `faith_eval.yaml` exist under `nemo_curator.stages.text.experimental.translation.prompts`.
+- LLM smoke: translate two plain-text JSONL rows with `faith_eval.enabled=false`.
+- NMT smoke: call `GET /health`, then translate two rows with `backend=nmt`.
+- Chat smoke: translate `messages.*.content` and verify `tool_calls[].function.arguments` remains valid JSON.
+- If a command fails with CLI argument errors, return to the documented step command template before retrying.
+- If file validation fails with `FileNotFoundError`, re-check actual output paths and validate only existing files.
+
+## Runtime Prerequisites
+
+- Runtime dependencies must include parser/config basics such as `toml` and `pyyaml`.
+- If an eval container is missing these packages, report the environment blocker
+  and still provide a complete handoff.
+- Do not end with blocker-only output when a command template and expected
+  output path can still be provided.
+
+## Completion Checklist
+
+Before ending a response, ensure these are explicitly present:
+
+1. Runnable config evidence.
+2. `Run` command and expected `Output` path.
+3. Required `Env` variable names, never values.
+4. For FAITH runs, `faith_eval.enabled=true` and expected FAITH outputs.
+5. Any environment/runtime blocker plus the workaround-ready command.
diff --git a/src/nemotron/steps/translate/translation/config/default.yaml b/src/nemotron/steps/translate/nemo_curator/config/default.yaml
similarity index 80%
rename from src/nemotron/steps/translate/translation/config/default.yaml
rename to src/nemotron/steps/translate/nemo_curator/config/default.yaml
index aee2b39ac..6f534b5c4 100644
--- a/src/nemotron/steps/translate/translation/config/default.yaml
+++ b/src/nemotron/steps/translate/nemo_curator/config/default.yaml
@@ -1,5 +1,10 @@
 # Starter config for NeMo Curator corpus translation.
 
+run:
+  env:
+    mounts:
+      - ${auto_mount:git+https://github.com/NVIDIA-NeMo/Curator.git@d10cd6ffe9f5ac4cbb176d7b3ada698f22633aea,/opt/Curator}
+
 input_path: /path/to/filtered_data.jsonl
 output_dir: ./output/translated
 
@@ -23,6 +28,7 @@ messages_content_field: content
 segmentation_mode: coarse # coarse | fine
 min_segment_chars: 0
 max_concurrent_requests: 64
+generation_config: null   # Optional mapping of OpenAI-compatible generation settings for translation (e.g. max_tokens, temperature).
 skip_translated: false
 files_per_partition: null
 blocksize: null
@@ -37,8 +43,11 @@ faith_eval:
   enabled: true
   threshold: 2.5
   model_name: ""
-  segment_level: true
   filter_enabled: true
+  max_concurrent_requests: 64
+  generation_config:
+    max_tokens: 2048
+    temperature: 0.0
 
 nmt:
   server_url: http://localhost:5000
diff --git a/src/nemotron/steps/translate/translation/step.py b/src/nemotron/steps/translate/nemo_curator/step.py
similarity index 78%
rename from src/nemotron/steps/translate/translation/step.py
rename to src/nemotron/steps/translate/nemo_curator/step.py
index 22a0fb2be..6da4ec5e4 100644
--- a/src/nemotron/steps/translate/translation/step.py
+++ b/src/nemotron/steps/translate/nemo_curator/step.py
@@ -2,7 +2,7 @@
 # /// script
 # [tool.runspec]
 # schema = "1"
-# name = "steps/translate/translation"
+# name = "steps/translate/nemo_curator"
 #
 # [tool.runspec.run]
 # launch = "python"
@@ -31,6 +31,17 @@
 
 DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
 log = logging.getLogger(__name__)
+_GENERATION_CONFIG_KEYS = {
+    "extra_kwargs",
+    "max_tokens",
+    "n",
+    "seed",
+    "stop",
+    "stream",
+    "temperature",
+    "top_k",
+    "top_p",
+}
 
 
 def _required_path(config: dict[str, Any], key: str) -> str:
@@ -139,6 +150,45 @@ def _backend_config(config: dict[str, Any]) -> dict[str, Any]:
     return {}
 
 
+def _build_generation_config(raw_config: Any) -> Any | None:  # GenerationConfig, or None when unset
+    if raw_config is None:
+        return None
+    if not isinstance(raw_config, dict):
+        raise ValueError("generation_config must be a mapping")
+
+    from nemo_curator.models.client.llm_client import GenerationConfig  # deferred until a mapping is supplied
+
+    generation_kwargs: dict[str, Any] = {}
+    extra_kwargs = dict(raw_config.get("extra_kwargs") or {})  # shallow copy; raw_config itself is not mutated
+    for key, value in raw_config.items():
+        if key == "extra_kwargs":  # already copied above
+            continue
+        if key in _GENERATION_CONFIG_KEYS:
+            generation_kwargs[key] = value
+        else:  # unknown top-level keys are folded into extra_kwargs, replacing a nested duplicate
+            extra_kwargs[key] = value
+
+    if extra_kwargs:  # omit the argument entirely when there is nothing extra to pass
+        generation_kwargs["extra_kwargs"] = extra_kwargs
+    return GenerationConfig(**generation_kwargs)
+
+
+def _configure_faith_stage(stage: Any, faith_cfg: dict[str, Any]) -> None:  # mutates sub-stages in place
+    generation_config = _build_generation_config(faith_cfg.get("generation_config"))
+    max_concurrent_requests = faith_cfg.get("max_concurrent_requests")
+
+    if generation_config is None and max_concurrent_requests is None:  # nothing to override
+        return
+    # NOTE(review): assumes decompose() returns the stage objects that actually run; no match is silent — confirm.
+    for execution_stage in stage.decompose():
+        if getattr(execution_stage, "name", "") != "FaithEvalFilter":  # only the FAITH filter stage is touched
+            continue
+        if generation_config is not None:
+            execution_stage.generation_config = generation_config
+        if max_concurrent_requests is not None:
+            execution_stage.max_concurrent_requests = int(max_concurrent_requests)  # coerce config value
+
+
 def _text_field(value: Any) -> str | list[str]:
     if isinstance(value, list):
         return [str(item) for item in value]
@@ -146,13 +196,13 @@ def _text_field(value: Any) -> str | list[str]:
 
 
 def _build_translation_stage(config: dict[str, Any]) -> Any:
-    from nemo_curator.stages.text.translation import TranslationStage
+    from nemo_curator.stages.text.experimental.translation import TranslationStage
 
     faith_cfg = config.get("faith_eval", {}) or {}
     enable_faith = bool(faith_cfg.get("enabled", False))
     server = config.get("server", {}) or {}
 
-    return TranslationStage(
+    stage = TranslationStage(
         source_lang=str(config["source_language"]).lower(),
         target_lang=str(config["target_language"]).lower(),
         text_field=_text_field(config.get("text_field", "messages.*.content")),
@@ -161,12 +211,12 @@ def _build_translation_stage(config: dict[str, Any]) -> Any:
         min_segment_chars=int(config.get("min_segment_chars", 0)),
         client=_build_curator_client(config, enable_faith=enable_faith),
         model_name=str(server.get("model") or ""),
+        generation_config=_build_generation_config(config.get("generation_config")),
         backend_type=str(config.get("backend", "llm")),
         backend_config=_backend_config(config),
         enable_faith_eval=enable_faith,
         faith_threshold=float(faith_cfg.get("threshold", 2.5)),
         faith_model_name=str(faith_cfg.get("model_name") or server.get("model") or ""),
-        segment_level=bool(faith_cfg.get("segment_level", False)),
         filter_enabled=bool(faith_cfg.get("filter_enabled", True)),
         output_mode=str(config.get("output_mode", "both")),
         merge_scores=bool(config.get("merge_scores", True)),
@@ -176,6 +226,9 @@ def _build_translation_stage(config: dict[str, Any]) -> Any:
         skip_translated=bool(config.get("skip_translated", False)),
         translation_column=str(config.get("translation_column", "translated_text")),
     )
+    if enable_faith:
+        _configure_faith_stage(stage, faith_cfg)
+    return stage
 
 
 def run(config: dict[str, Any]) -> Path:
diff --git a/src/nemotron/steps/translate/translation/step.toml b/src/nemotron/steps/translate/nemo_curator/step.toml
similarity index 73%
rename from src/nemotron/steps/translate/translation/step.toml
rename to src/nemotron/steps/translate/nemo_curator/step.toml
index 23a56c3bc..c07bb83ef 100644
--- a/src/nemotron/steps/translate/translation/step.toml
+++ b/src/nemotron/steps/translate/nemo_curator/step.toml
@@ -1,9 +1,9 @@
 [step]
-id = "translate/translation"
-name = "Corpus Translation + FAITH Scoring"
+id = "translate/nemo_curator"
+name = "Corpus Translation + FAITH Scoring (NeMo Curator)"
 category = "translate"
 description = """Translate JSONL or Parquet training corpora with NeMo Curator's TranslationStage, preserving structured fields and optionally attaching FAITH quality scores."""
-tags = ["translate", "nemo-curator", "faith", "multilingual", "jsonl", "parquet"]
+tags = ["translate", "translation", "nemo-curator", "faith", "multilingual", "jsonl", "parquet"]
 
 [[consumes]]
 type = "filtered_jsonl"
@@ -34,6 +34,18 @@ name = "text_field"
 description = "Source field path to translate. Use messages.*.content for OpenAI-style chat records."
 default = "messages.*.content"
 
+[[parameters]]
+name = "input_format"
+description = "Input format: auto infers homogeneous JSONL/Parquet paths; set explicitly when globs or directories are ambiguous."
+default = "auto"
+choices = ["auto", "jsonl", "parquet"]
+
+[[parameters]]
+name = "output_format"
+description = "Output shard format written by Curator."
+default = "jsonl"
+choices = ["jsonl", "parquet"]
+
 [[parameters]]
 name = "backend"
 description = "Translation backend. Use llm for OpenAI-compatible endpoints, nmt for a local service, or cloud backends when configured."
@@ -63,44 +75,55 @@ default = "both"
 choices = ["replaced", "raw", "both"]
 
 [[parameters]]
-name = "faith_enabled"
+name = "faith_eval.enabled"
 description = "Whether to run FAITH translation quality scoring through an LLM client."
 default = true
 
 [[parameters]]
-name = "faith_threshold"
+name = "faith_eval.threshold"
 description = "Minimum FAITH average score to keep translated records when filtering is enabled."
 default = 2.5
 
+[[parameters]]
+name = "reconstruct_messages"
+description = "For chat records, emit reconstructed translated messages for inspection and downstream SFT prep."
+default = true
+
+[[parameters]]
+name = "segmentation_mode"
+description = "Translation segmentation mode. Start coarse for JSON, code, tool payloads, or structured chat records."
+default = "coarse"
+choices = ["coarse", "fine"]
+
 [[strategies]]
 when = "The input records are OpenAI-style chat data"
 then = "Set text_field=messages.*.content and reconstruct_messages=true so translated_messages is emitted for inspection."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[strategies]]
 when = "The corpus is large and a local translation service is available"
 then = "Use backend=nmt with nmt.server_url and keep Curator reader/writer stages as the default I/O path."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[strategies]]
 when = "The user selects a hosted LLM backend"
 then = "Ask for the exact live model name and verify it is available before a real run; hosted models can be retired."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[strategies]]
 when = "The user selects Google or AWS translation"
 then = "Keep secrets out of config files. Use cloud provider environment credentials and set only project/location or region in YAML."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[strategies]]
 when = "The corpus contains JSON, code, tool payloads, or structured chat messages"
 then = "Keep segmentation_mode=coarse first; Curator preserves valid JSON and fenced code while translating natural-language segments."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[strategies]]
 when = "Translation quality must gate SFT data"
 then = "Enable faith_eval and set output_mode=both so scores and translation metadata remain available for downstream review."
-skill = "skills/nemotron-customize/context/curator-translation-faith.txt"
+skill = "skills/nemotron-customize/references/context/curator-translation-faith.txt"
 
 [[errors]]
 name = "missing_llm_endpoint"
@@ -131,8 +154,9 @@ name = "single_huge_file_memory_pressure"
 recovery = "Curator readers are file-partition oriented. For one huge file, generate a one-off row-chunking pre-step and feed chunks into this step."
 
 [reference]
-script = "src/nemotron/steps/translate/translation/step.py"
-docs = "docs/customize/steps/translate/translation.md"
+skill = "src/nemotron/steps/translate/nemo_curator/SKILL.md"
+script = "src/nemotron/steps/translate/nemo_curator/step.py"
+docs = "docs/customize/steps/translate/nemo-curator.md"
 skills = [
-  "skills/nemotron-customize/context/curator-translation-faith.txt",
+  "skills/nemotron-customize/references/context/curator-translation-faith.txt",
 ]
diff --git a/src/nemotron/steps/translate/nemo_skills/SKILL.md b/src/nemotron/steps/translate/nemo_skills/SKILL.md
deleted file mode 100644
index e4d7a5093..000000000
--- a/src/nemotron/steps/translate/nemo_skills/SKILL.md
+++ /dev/null
@@ -1,49 +0,0 @@
----
-name: nemotron-translate-nemo-skills
-description: Configure Nemotron translate/nemo_skills to translate filtered_jsonl into a target language using NeMo Skills (Speaker) and attach FAITH quality signals. Use for multilingual SFT data prep, NIM/vLLM-served translation models, faith-threshold filtering, and segmented-message preservation.
----
-
-# Translation + FAITH Scoring (NeMo Skills)
-
-Use `translate/nemo_skills` to localize `filtered_jsonl` and gate by FAITH
-score before downstream training.
-
-Read `step.toml` for full strategies, errors, and parameter choices.
-
-## Inputs and outputs
-
-- Consume: `filtered_jsonl`.
-- Produce: `translated_jsonl` (target-language messages + FAITH metadata).
-
-## Configure
-
-- **Backend**:
-  - `nim` for production-ready managed endpoints (default).
-  - `vllm` for self-hosted or local checkpoint evaluation.
-  - Either way, the model is reached over OpenAI-compatible chat endpoints —
-    `missing_openai_endpoint` recovery is "provision a reachable endpoint and
-    pass server address + model name into the Speaker config."
-- **Faith threshold**: `0.7` keeps high-confidence translations. Set to `0.0`
-  to score without filtering (useful for analysis runs).
-- **Long or structured messages**: enable fine segmentation so Speaker keeps
-  separators and reconstructs structure faithfully.
-- **FAITH must run on raw Speaker output**, not on a downstream JSONL that's
-  already dropped `translations.segmented_translation` metadata. If you see
-  `faith_input_missing_segmented_translation`, you regressed the input.
-- Reference [src/nemotron/steps/patterns/multilingual-tokenizer-check.md](../../patterns/multilingual-tokenizer-check.md)
-  before chaining into a multilingual SFT step.
-
-## Local files
-
-- Contract: [step.toml](step.toml)
-- Runner: [step.py](step.py)
-- Configs: `config/default.yaml`, `config/tiny.yaml`
-
-## Guardrails
-
-- Don't lower `faith_threshold` to "rescue" rejected records — the threshold
-  is a quality gate, not a quota knob.
-- Inspect a sample of translated records before training; FAITH is a useful
-  signal, not a proof of meaning preservation.
-- Don't run translation and SFT off different model lineages without a
-  tokenizer-coverage check on the target language.
diff --git a/src/nemotron/steps/translate/nemo_skills/config/default.yaml b/src/nemotron/steps/translate/nemo_skills/config/default.yaml
deleted file mode 100644
index bf9c758ae..000000000
--- a/src/nemotron/steps/translate/nemo_skills/config/default.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-# Starter config for LLM translation + FAITH scoring.
-# backend selects the serving style; both NIM and vLLM should expose
-# an OpenAI-compatible endpoint for Speaker.
-
-input_file: /path/to/filtered_data.jsonl
-data_directory: ./output/translation_run
-source_language: EN
-target_language: TARGET_LANG
-faith_threshold: 0.7
-backend: nim
-
-server:
-  address: http://nim-llm:8000/v1
-  translation_model: nvidia/llama-3.3-nemotron-super-49b-v1.5
-  faith_model: nvidia/llama-3.3-nemotron-super-49b-v1.5
-
-translation:
-  fields_to_translate:
-    - messages.*.content
-  translation_key: translations
-  prompt_config: /workspace/speaker/src/speaker/driver/translate/prompts/prompt_translate.yaml
-
-faith_eval:
-  fields_to_evaluate:
-    - messages.*.content
-  scores_key: faith_scores
-  prompt_config: /workspace/speaker/src/speaker/driver/translate/prompts/prompt_faith_eval.yaml
-
-processing:
-  num_chunks: 1
-  max_concurrent_requests: 16
-  batch_size: 1
-
-inference:
-  temperature: 0.7
-  top_k: -1
-  top_p: 0.8
-
-faith_inference:
-  temperature: 0.0
-  top_k: -1
-  top_p: 1.0
diff --git a/src/nemotron/steps/translate/nemo_skills/step.py b/src/nemotron/steps/translate/nemo_skills/step.py
deleted file mode 100644
index 62127cf43..000000000
--- a/src/nemotron/steps/translate/nemo_skills/step.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#!/usr/bin/env python3
-# /// script
-# [tool.runspec]
-# schema = "1"
-# name = "steps/translate/nemo_skills"
-#
-# [tool.runspec.run]
-# launch = "python"
-#
-# [tool.runspec.config]
-# dir = "./config"
-# default = "default"
-# format = "yaml"
-#
-# [tool.runspec.resources]
-# nodes = 1
-# gpus_per_node = 0
-# ///
-
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Thin Speaker translation wrapper; full drivers: `speaker/src/speaker/driver/translate/`."""
-
-from __future__ import annotations
-
-import argparse
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-
-import yaml
-
-DEFAULT_CONFIG = Path(__file__).parent / "config" / "default.yaml"
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--config", type=Path, default=DEFAULT_CONFIG)
-    cfg = yaml.safe_load(parser.parse_args().config.read_text())
-    server = cfg["server"]
-
-    translate_cfg = {
-        "model_type": "llm",
-        "server": {
-            "type": "openai",
-            "model": server["translation_model"],
-            "address": server.get("address") or "http://nim-llm:8000/v1",
-        },
-        "translation": {
-            "source_lang": cfg["source_language"].lower(),
-            "target_lang": cfg["target_language"].lower(),
-            "fields_to_translate": cfg["translation"]["fields_to_translate"],
-            "translation_key": cfg["translation"]["translation_key"],
-            "prompt_config": cfg["translation"]["prompt_config"],
-        },
-        "processing": cfg["processing"],
-        "inference": cfg["inference"],
-    }
-    faith_cfg = {
-        "server": {
-            "type": "openai",
-            "model": server["faith_model"],
-            "address": server.get("address") or "http://nim-llm:8000/v1",
-        },
-        "faith_eval": {
-            "source_lang": cfg["source_language"].lower(),
-            "target_lang": cfg["target_language"].lower(),
-            "fields_to_evaluate": cfg["faith_eval"]["fields_to_evaluate"],
-            "translation_key": cfg["translation"]["translation_key"],
-            "scores_key": cfg["faith_eval"]["scores_key"],
-            "prompt_config": cfg["faith_eval"]["prompt_config"],
-        },
-        "processing": {
-            "num_chunks": cfg["processing"]["num_chunks"],
-            "max_concurrent_requests": cfg["processing"]["max_concurrent_requests"],
-        },
-        "inference": cfg["faith_inference"],
-    }
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        translate_path = Path(tmpdir) / "translate.yaml"
-        faith_path = Path(tmpdir) / "faith.yaml"
-        translate_path.write_text(yaml.safe_dump(translate_cfg, sort_keys=False))
-        faith_path.write_text(yaml.safe_dump(faith_cfg, sort_keys=False))
-
-        subprocess.run(
-            [
-                sys.executable,
-                "-m",
-                "speaker.driver.translate.pipeline_translate",
-                "--config",
-                str(translate_path),
-                "--data-directory",
-                cfg["data_directory"],
-                "--input-file",
-                cfg["input_file"],
-            ],
-            check=True,
-        )
-        subprocess.run(
-            [
-                sys.executable,
-                "-m",
-                "speaker.driver.translate.pipeline_faith_eval",
-                "--config",
-                str(faith_path),
-                "--data-directory",
-                cfg["data_directory"],
-            ],
-            check=True,
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/nemotron/steps/translate/nemo_skills/step.toml b/src/nemotron/steps/translate/nemo_skills/step.toml
deleted file mode 100644
index c4e8c8d59..000000000
--- a/src/nemotron/steps/translate/nemo_skills/step.toml
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[step]
-id = "translate/nemo_skills"
-name = "Translation + FAITH Scoring (NeMo Skills)"
-category = "translate"
-description = """Translate filtered JSONL into a target language with NeMo Skills and attach FAITH-based quality signals so downstream steps can keep high-faith training data."""
-tags = ["translate", "nemo-skills", "faith", "multilingual", "jsonl"]
-
-[[consumes]]
-type = "filtered_jsonl"
-description = "Filtered JSONL records whose message fields should be translated."
-
-[[produces]]
-type = "translated_jsonl"
-description = "Translated JSONL with target-language messages plus FAITH quality metadata for downstream use."
-
-[[parameters]]
-name = "target_language"
-description = "Target ISO 639-1 language code for translation output."
-default = "hi"
-
-[[parameters]]
-name = "source_language"
-description = "Source ISO 639-1 language code expected in the input JSONL."
-default = "en"
-
-[[parameters]]
-name = "faith_threshold"
-description = "Minimum normalized FAITH quality threshold to keep translated records; set to 0.0 to score without filtering."
-default = 0.7
-
-[[parameters]]
-name = "backend"
-description = "Serving backend exposed through an OpenAI-compatible endpoint for the translation model."
-default = "nim"
-choices = ["nim", "vllm"]
-
-[[strategies]]
-when = "You want a production-ready managed endpoint"
-then = "Use backend=nim and point Speaker at the NIM OpenAI-compatible chat endpoint."
-
-[[strategies]]
-when = "You are self-hosting the model or evaluating local checkpoint variants"
-then = "Use backend=vllm and provide a compatible OpenAI-style server address for translation and FAITH scoring."
-
-[[strategies]]
-when = "Long or structured messages are getting truncated or reordered"
-then = "Enable fine segmentation so Speaker preserves separators and reconstructs the original message structure more faithfully."
-
-[[strategies]]
-when = "You need to filter translations by quality before SFT"
-then = "Run FAITH scoring on the raw translation output and drop records below faith_threshold before exporting translated_jsonl."
-
-[[errors]]
-name = "missing_openai_endpoint"
-recovery = "Provision a reachable NIM or vLLM OpenAI-compatible endpoint and pass the correct server address and model name into the Speaker config."
-
-[[errors]]
-name = "faith_input_missing_segmented_translation"
-recovery = "Run FAITH on Speaker's raw translation output, not on a final JSONL that has already discarded the translations.segmented_translation metadata."
-
-[[errors]]
-name = "field_path_mismatch"
-recovery = "Make sure the translation fields_to_translate path matches the actual JSONL schema, usually messages.*.content for chat data."
-
-[reference]
-script = "https://github.com/NVIDIA-NeMo/Skills"
-docs = "https://nvidia.github.io/NeMo-Skills/"
diff --git a/tests/deploy/test_airgap_runner.py b/tests/deploy/test_airgap_runner.py
new file mode 100644
index 000000000..33efa004b
--- /dev/null
+++ b/tests/deploy/test_airgap_runner.py
@@ -0,0 +1,376 @@
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+from omegaconf import OmegaConf
+
+from nemo_runspec.config.loader import load_config
+
+
+def _runner_module():  # loads the airgap runner script by file path; re-executed on every call
+    repo_root = Path(__file__).resolve().parents[2]  # tests/deploy/<this file> -> repository root
+    path = repo_root / "deploy/nemotron-customizer/airgap/runner.py"
+    spec = importlib.util.spec_from_file_location("airgap_runner", path)
+    assert spec and spec.loader
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module  # registered before exec; presumably so the module can find itself — confirm
+    spec.loader.exec_module(module)
+    return module
+
+
+def test_airgap_runner_expands_and_validates_sft_dependency():
+    runner = _runner_module()
+    cfg = {
+        "workflow": {"stages": ["sft/megatron_bridge:tiny"]},
+        "dependencies": {"sft/megatron_bridge": ["data_prep/sft_packing:tiny"]},
+    }
+
+    targets = runner.expand_targets(cfg)
+    infos = runner.validate_targets(targets)
+
+    assert [target.spec for target in targets] == ["data_prep/sft_packing:tiny", "sft/megatron_bridge:tiny"]
+    assert infos["sft/megatron_bridge"].module == "nemotron.steps.sft.megatron_bridge.step"
+    assert infos["data_prep/sft_packing"].config_path.name == "tiny.yaml"
+    assert [item.target for item in infos["sft/megatron_bridge"].repo_overlays] == [
+        "/opt/megatron-lm",
+        "/opt/Megatron-Bridge",
+    ]
+
+
+def test_airgap_runner_groups_execution_images_by_base_image_and_repo_overlays(tmp_path):
+    runner = _runner_module()
+    overlay = runner.RepoOverlay(
+        repo="Megatron-Bridge",
+        url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+        ref="main",
+        target="/opt/Megatron-Bridge",
+    )
+    cfg = {
+        "step_execution_images": {
+            "data_prep/sft_packing": "a",
+            "sft/megatron_bridge": "b",
+        },
+        "execution_images": {
+            "a": {
+                "base_image": "image:base",
+                "tag": "image:a",
+                "tar": "a.tar",
+                "required_imports": ["omegaconf"],
+            },
+            "b": {
+                "base_image": "image:base",
+                "tag": "image:b",
+                "tar": "b.tar",
+                "required_imports": ["yaml"],
+            },
+        },
+    }
+
+    groups = runner.execution_groups(
+        cfg,
+        output_dir=tmp_path,
+        step_infos={
+            "data_prep/sft_packing": runner.StepInfo(
+                target=runner.Target("data_prep/sft_packing"),
+                step_dir=tmp_path,
+                step_py=tmp_path / "step.py",
+                step_toml=tmp_path / "step.toml",
+                config_path=None,
+                module="x",
+            ),
+            "sft/megatron_bridge": runner.StepInfo(
+                target=runner.Target("sft/megatron_bridge"),
+                step_dir=tmp_path,
+                step_py=tmp_path / "step.py",
+                step_toml=tmp_path / "step.toml",
+                config_path=None,
+                module="y",
+                repo_overlays=[overlay],
+            ),
+        },
+    )
+
+    assert len(groups) == 2
+    by_step = {group.steps[0]: group for group in groups}
+    assert by_step["data_prep/sft_packing"].base_image == "image:base"
+    assert by_step["data_prep/sft_packing"].required_imports == {"omegaconf"}
+    assert by_step["data_prep/sft_packing"].repo_overlays == []
+    assert by_step["sft/megatron_bridge"].base_image == "image:base"
+    assert by_step["sft/megatron_bridge"].required_imports == {"yaml"}
+    assert by_step["sft/megatron_bridge"].repo_overlays == [overlay]
+    assert len({group.tag for group in groups}) == 2
+
+
+def test_airgap_runner_only_builds_images_for_selected_steps(tmp_path):
+    runner = _runner_module()
+    cfg = {
+        "step_execution_images": {
+            "data_prep/sft_packing": "nemo-megatron",
+            "sft/automodel": "nemo-automodel",
+        },
+        "execution_images": {
+            "nemo-megatron": {"base_image": "nemo:base"},
+            "nemo-automodel": {"base_image": "automodel:base"},
+        },
+    }
+
+    groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos={"data_prep/sft_packing": object()})
+
+    assert len(groups) == 1
+    assert groups[0].name.startswith("nemo-megatron-")
+    assert groups[0].steps == ["data_prep/sft_packing"]
+
+
+def test_airgap_runner_maps_sdg_to_light_sdk_image(tmp_path):
+    runner = _runner_module()
+    cfg = runner.load_yaml(runner.AIRGAP_DIR / "airgap.yaml")
+    cfg["workflow"]["stages"] = ["sdg/data_designer:tiny"]
+
+    targets = runner.expand_targets(cfg)
+    infos = runner.validate_targets(targets)
+    groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos=infos)
+
+    assert [target.spec for target in targets] == ["sdg/data_designer:tiny"]
+    assert len(groups) == 1
+    assert groups[0].name.startswith("nemo-data-designer-")
+    assert groups[0].base_image == "nvcr.io/nvidia/nemo:25.11.nemotron_3_nano"
+    assert "data_designer" in groups[0].required_imports
+
+
+def test_airgap_runner_maps_byob_to_data_designer_image(tmp_path):
+    runner = _runner_module()
+    cfg = runner.load_yaml(runner.AIRGAP_DIR / "airgap.yaml")
+    cfg["workflow"]["stages"] = ["byob:tiny"]
+
+    targets = runner.expand_targets(cfg)
+    infos = runner.validate_targets(targets)
+    groups = runner.execution_groups(cfg, output_dir=tmp_path, step_infos=infos)
+
+    assert [target.spec for target in targets] == ["byob:tiny"]
+    assert len(groups) == 1
+    assert groups[0].name.startswith("nemo-data-designer-")
+    assert "data_designer" in groups[0].required_imports
+
+
+def test_airgap_runner_target_override_selects_sdg_and_sft():
+    runner = _runner_module()
+    cfg = runner.load_yaml(runner.AIRGAP_DIR / "airgap.yaml")
+    cfg = runner.with_workflow_targets(
+        cfg,
+        runner.normalize_target_specs(["sdg/data_designer:tiny", "sft/megatron_bridge:tiny"]),
+    )
+
+    targets = runner.expand_targets(cfg)
+    infos = runner.validate_targets(targets)
+    groups = runner.execution_groups(cfg, output_dir=runner.AIRGAP_DIR / "out", step_infos=infos)
+
+    assert [target.spec for target in targets] == [
+        "sdg/data_designer:tiny",
+        "data_prep/sft_packing:tiny",
+        "sft/megatron_bridge:tiny",
+    ]
+    by_steps = {tuple(group.steps): group for group in groups}
+    merged = by_steps[("sdg/data_designer", "data_prep/sft_packing")]
+    assert merged.image_names == {"nemo-data-designer", "nemo-megatron"}
+    assert merged.tag.startswith("nemotron-customizer-nemo-data-designer-nemo-megatron-airgap-")
+
+
+def test_airgap_runner_splits_same_base_image_when_repo_overlays_differ(tmp_path):
+    runner = _runner_module()
+    overlay = runner.RepoOverlay(
+        repo="Megatron-Bridge",
+        url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+        ref="feature",
+        target="/opt/Megatron-Bridge",
+    )
+    cfg = {
+        "step_execution_images": {
+            "data_prep/sft_packing": "nemo-megatron",
+            "sft/megatron_bridge": "nemo-megatron",
+        },
+        "execution_images": {
+            "nemo-megatron": {
+                "base_image": "nemo:base",
+                "tag": "nemo-airgap:latest",
+                "tar": "nemo-airgap.tar",
+            },
+        },
+    }
+    groups = runner.execution_groups(
+        cfg,
+        output_dir=tmp_path,
+        step_infos={
+            "data_prep/sft_packing": runner.StepInfo(
+                target=runner.Target("data_prep/sft_packing"),
+                step_dir=tmp_path,
+                step_py=tmp_path / "step.py",
+                step_toml=tmp_path / "step.toml",
+                config_path=None,
+                module="x",
+            ),
+            "sft/megatron_bridge": runner.StepInfo(
+                target=runner.Target("sft/megatron_bridge"),
+                step_dir=tmp_path,
+                step_py=tmp_path / "step.py",
+                step_toml=tmp_path / "step.toml",
+                config_path=None,
+                module="y",
+                repo_overlays=[overlay],
+            ),
+        },
+    )
+
+    assert len(groups) == 2
+    assert sorted([group.steps for group in groups]) == [["data_prep/sft_packing"], ["sft/megatron_bridge"]]
+    assert len({group.tag for group in groups}) == 2
+
+
+def test_airgap_runner_uses_collision_safe_repo_overlay_dirs():
+    runner = _runner_module()
+    first = runner.RepoOverlay(
+        repo="Megatron-Bridge",
+        url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+        ref="main",
+        target="/opt/Megatron-Bridge",
+    )
+    second = runner.RepoOverlay(
+        repo="Megatron-Bridge",
+        url="https://github.com/example/Megatron-Bridge.git",
+        ref="main",
+        target="/opt/Other-Bridge",
+    )
+
+    assert runner.repo_overlay_dir_name(first) != runner.repo_overlay_dir_name(second)
+    assert runner.repo_overlay_build_manifest(first)["source"] == runner.repo_overlay_dir_name(first)
+
+
+def test_airgap_runner_auto_adds_stage_prerequisites():
+    runner = _runner_module()
+
+    assert runner.normalize_stages(["build-execution-images"]) == [
+        "validate",
+        "discover-execution-deps",
+        "build-execution-images",
+    ]
+    assert runner.normalize_stages(["save-images"]) == [
+        "validate",
+        "discover-execution-deps",
+        "build-launcher-image",
+        "build-execution-images",
+        "save-images",
+    ]
+
+
+def test_airgap_runner_rejects_build_output_outside_docker_context(tmp_path):
+    runner = _runner_module()
+
+    with pytest.raises(SystemExit, match="paths.output_dir=.*must live under the repo root"):
+        runner.validate_docker_context_path(tmp_path, field="paths.output_dir")
+
+
+def test_airgap_runner_reports_dependency_cycles():
+    runner = _runner_module()
+    cfg = {
+        "workflow": {"stages": ["a/b"]},
+        "dependencies": {
+            "a/b": ["c/d"],
+            "c/d": ["a/b"],
+        },
+    }
+
+    with pytest.raises(SystemExit, match=r"cyclic airgap dependency detected: a/b -> c/d -> a/b"):
+        runner.expand_targets(cfg)
+
+
+def test_airgap_runner_tag_suffix_handles_ports_and_digests():
+    runner = _runner_module()
+
+    assert runner.tag_with_suffix("registry:5000/team/image:latest", "abc123") == (
+        "registry:5000/team/image-abc123:latest"
+    )
+    assert runner.tag_with_suffix("repo/image:latest@sha256:deadbeef", "abc123") == (
+        "repo/image-abc123:latest@sha256:deadbeef"
+    )
+    assert runner.tag_with_suffix("repo/image@sha256:deadbeef", "abc123") == "repo/image-abc123@sha256:deadbeef"
+
+
+def test_airgap_runner_saved_image_manifest_has_checksum(tmp_path):
+    runner = _runner_module()
+    image_tar = tmp_path / "image.tar"
+    image_tar.write_text("image bytes", encoding="utf-8")
+
+    saved = runner.saved_image_manifest("image:tag", image_tar, execute=True, role="execution", name="group")
+
+    assert saved["role"] == "execution"
+    assert saved["name"] == "group"
+    assert saved["image"] == "image:tag"
+    assert saved["tar"] == str(image_tar)
+    assert saved["sha256"] == runner.sha256_file(image_tar)
+
+
+def test_airgap_runner_platform_matching_accepts_variant_only_when_compatible():
+    runner = _runner_module()
+
+    assert runner.platform_matches("linux/amd64", "linux/amd64")
+    assert runner.platform_matches("linux/arm64/v8", "linux/arm64")
+    assert not runner.platform_matches("linux/amd64", "linux/arm64")
+    assert runner.pip_cache_volume("linux/amd64") == "nemotron-airgap-pip-cache-linux-amd64"
+
+
+def test_airgap_runner_progress_state_resumes_and_completes(tmp_path):
+    runner = _runner_module()
+    cfg = {"workflow": {"stages": ["byob:tiny"]}}
+    config_path = tmp_path / "airgap.yaml"
+    stages = ["validate"]
+    # First load for this output directory; the test requires a state object to come back.
+    state = runner.load_or_start_run_state(
+        tmp_path,
+        config_path=config_path,
+        cfg=cfg,
+        stages=stages,
+        execute=True,
+    )
+    assert state is not None
+    runner.begin_action(state, "validate")
+    assert state.path.exists()
+    assert not state.done_path.exists()
+    # Mark "validate" complete, then load again with identical arguments to exercise the resume path.
+    runner.complete_action(state, "validate", {"targets": ["byob:tiny"]})
+    resumed = runner.load_or_start_run_state(
+        tmp_path,
+        config_path=config_path,
+        cfg=cfg,
+        stages=stages,
+        execute=True,
+    )
+    assert runner.action_completed(resumed, "validate")
+    # Completing the run is expected to remove state.path and leave state.done_path behind.
+    manifest = tmp_path / "airgap-manifest.yaml"
+    manifest.write_text("schema_version: 1\n", encoding="utf-8")
+    runner.complete_run_state(resumed, manifest_path=manifest)
+
+    assert not state.path.exists()
+    assert state.done_path.exists()
+
+
+def test_airgap_runner_static_import_scan_stays_direct():
+    runner = _runner_module()
+    step_py = runner.STEP_ROOT / "data_prep/sft_packing/step.py"
+
+    imports = runner.discover_external_imports(step_py)
+
+    assert "omegaconf" in imports
+    assert "cosmos_xenna" not in imports
+
+
+def test_sft_airgap_overlay_clears_auto_mounts_but_inherits_config():
+    runner = _runner_module()
+    config = load_config(runner.AIRGAP_DIR / "configs/sft_megatron_bridge_tiny.yaml")
+    plain = OmegaConf.to_container(config, resolve=False)
+
+    assert plain["run"]["env"]["mounts"] == []
+    assert plain["hf_model_path"] == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
+    assert plain["dataset"]["packed_sequence_specs"]["packed_sequence_size"] == 4096
diff --git a/tests/nemo_runspec/test_data_mover.py b/tests/nemo_runspec/test_data_mover.py
index b3ea7d150..b2c2c218f 100644
--- a/tests/nemo_runspec/test_data_mover.py
+++ b/tests/nemo_runspec/test_data_mover.py
@@ -80,19 +80,18 @@ def test_auto_includes_scopes_to_active_step_subtree(tmp_path):
     (steps / "sft" / "automodel").mkdir(parents=True)
     (steps / "sft" / "__init__.py").write_text("")
     (steps / "sft" / "automodel" / "step.py").write_text("")
-    (steps / "prep").mkdir()
-    (steps / "prep" / "__init__.py").write_text("")
-    (steps / "prep" / "_common.py").write_text("")
-    (steps / "prep" / "sft_packing").mkdir()
-    (steps / "prep" / "sft_packing" / "step.py").write_text("")
+    (steps / "data_prep").mkdir()
+    (steps / "data_prep" / "__init__.py").write_text("")
+    (steps / "data_prep" / "_common.py").write_text("")
+    (steps / "data_prep" / "sft_packing").mkdir()
+    (steps / "data_prep" / "sft_packing" / "step.py").write_text("")
     (steps / "rl" / "nemo_rl").mkdir(parents=True)
     (steps / "rl" / "nemo_rl" / "step.py").write_text("")
 
     includes = _auto_includes(tmp_path, script_path="src/nemotron/steps/sft/automodel/step.py")
 
     assert "src/nemotron/steps/index.py" in includes
-    assert "src/nemotron/steps/sft/__init__.py" in includes
-    assert "src/nemotron/steps/sft/automodel" in includes
+    assert "src/nemotron/steps/sft" in includes
     assert "src/nemotron/steps/_runners" in includes
     assert "src/nemotron/steps/rl" not in includes
     assert "src/nemotron/recipes/nano3" not in includes
@@ -101,22 +100,77 @@ def test_auto_includes_scopes_to_active_step_subtree(tmp_path):
 def test_auto_includes_ships_active_step_ancestor_helpers(tmp_path):
     _write_fake_repo(tmp_path)
     steps = tmp_path / "src" / "nemotron" / "steps"
-    (steps / "prep").mkdir(parents=True)
-    (steps / "prep" / "__init__.py").write_text("")
-    (steps / "prep" / "_common.py").write_text("")
-    (steps / "prep" / "sft_packing").mkdir()
-    (steps / "prep" / "sft_packing" / "step.py").write_text("")
+    (steps / "data_prep").mkdir(parents=True)
+    (steps / "data_prep" / "__init__.py").write_text("")
+    (steps / "data_prep" / "_common.py").write_text("")
+    (steps / "data_prep" / "sft_packing").mkdir()
+    (steps / "data_prep" / "sft_packing" / "step.py").write_text("")
     (steps / "sft" / "automodel").mkdir(parents=True)
     (steps / "sft" / "automodel" / "step.py").write_text("")
 
-    includes = _auto_includes(tmp_path, script_path="src/nemotron/steps/prep/sft_packing/step.py")
+    includes = _auto_includes(tmp_path, script_path="src/nemotron/steps/data_prep/sft_packing/step.py")
 
-    assert "src/nemotron/steps/prep/__init__.py" in includes
-    assert "src/nemotron/steps/prep/_common.py" in includes
-    assert "src/nemotron/steps/prep/sft_packing" in includes
+    assert "src/nemotron/steps/data_prep" in includes
     assert "src/nemotron/steps/sft" not in includes
 
 
+def test_source_packager_ships_active_branch_support_paths(tmp_path):
+    _write_fake_repo(tmp_path)
+    steps = tmp_path / "src" / "nemotron" / "steps"
+    byob = steps / "byob"
+    (byob / "mcq").mkdir(parents=True)
+    (byob / "legacy").mkdir()
+    (byob / "__init__.py").write_text("from nemotron.steps.byob.adapter import x\n")
+    (byob / "adapter.py").write_text("x = 1\n")
+    (byob / "scripts").mkdir()
+    (byob / "scripts" / "run.py").write_text("")
+    (byob / "runtime").mkdir()
+    (byob / "runtime" / "config.py").write_text("")
+    (byob / "assets").mkdir()
+    (byob / "assets" / "tiny.txt").write_text("")
+    (byob / "mcq" / "step.py").write_text("")
+    (byob / "legacy" / "step.py").write_text("")
+    (steps / "sft" / "automodel").mkdir(parents=True)
+    (steps / "sft" / "automodel" / "step.py").write_text("")
+    # Package with byob/mcq/step.py as the active script and read back the archive member names.
+    pkg = SourcePackager(
+        repo_root=str(tmp_path),
+        script_path="src/nemotron/steps/byob/mcq/step.py",
+    )
+    out = pkg.package(None, str(tmp_path), "test")
+    with tarfile.open(out) as tf:
+        names = set(tf.getnames())
+    # Support files under byob/ are expected in the archive; the unrelated sft step is not.
+    assert "src/nemotron/steps/byob/__init__.py" in names
+    assert "src/nemotron/steps/byob/adapter.py" in names
+    assert "src/nemotron/steps/byob/scripts/run.py" in names
+    assert "src/nemotron/steps/byob/runtime/config.py" in names
+    assert "src/nemotron/steps/byob/assets/tiny.txt" in names
+    assert "src/nemotron/steps/byob/mcq/step.py" in names
+    assert "src/nemotron/steps/sft/automodel/step.py" not in names
+
+
+def test_auto_includes_scopes_active_script_collection_without_step_names(tmp_path):
+    _write_fake_repo(tmp_path)
+    flows = tmp_path / "src" / "nemotron" / "workflows"
+    benchmark = flows / "benchmark"
+    (benchmark / "mcq").mkdir(parents=True)
+    (benchmark / "legacy").mkdir()
+    (flows / "other").mkdir()
+    (benchmark / "__init__.py").write_text("")
+    (benchmark / "adapter.py").write_text("x = 1\n")
+    (benchmark / "assets").mkdir()
+    (benchmark / "assets" / "tiny.txt").write_text("")
+    (benchmark / "mcq" / "run.py").write_text("")
+    (benchmark / "legacy" / "run.py").write_text("")
+    (flows / "other" / "run.py").write_text("")
+    # The active script is workflows/benchmark/mcq/run.py, i.e. not a steps/ path and not named step.py.
+    includes = _auto_includes(tmp_path, script_path="src/nemotron/workflows/benchmark/mcq/run.py")
+
+    assert "src/nemotron/workflows/benchmark" in includes  # the active script's collection is shipped
+    assert "src/nemotron/workflows/other" not in includes  # the sibling collection is left out
+
+
 def test_auto_includes_raises_when_src_missing(tmp_path):
     with pytest.raises(ValueError, match="No src/"):
         _auto_includes(tmp_path, script_path=None)
@@ -127,6 +181,20 @@ def test_auto_includes_raises_when_src_missing(tmp_path):
 
 def test_source_packager_filters_pycache_and_pyc(tmp_path):
     _write_fake_repo(tmp_path)
+    artifacts = tmp_path / "src" / "nemotron" / "kit" / "artifacts"
+    artifacts.mkdir()
+    for name in (
+        "records.parquet",
+        "table.arrow",
+        "weights.safetensors",
+        "checkpoint.ckpt",
+        "optimizer.pt",
+        "tensor.npy",
+        "index.idx",
+        "model.onnx",
+        "model.h5",
+    ):
+        (artifacts / name).write_text("large artifact")
     pkg = SourcePackager(
         repo_root=str(tmp_path),
         script_path="src/nemotron/recipes/nano3/x.py",
@@ -138,10 +206,26 @@ def test_source_packager_filters_pycache_and_pyc(tmp_path):
     # Pyc + __pycache__ stripped.
     assert not any(n.endswith(".pyc") for n in names)
     assert not any("__pycache__" in n for n in names)
+    assert not any("/artifacts/" in n for n in names)
     # Real package files present.
     assert any(n.endswith("src/nemo_runspec/__init__.py") for n in names)
 
 
+def test_source_packager_warns_when_tarball_exceeds_limit(tmp_path, monkeypatch, capsys):
+    _write_fake_repo(tmp_path)
+    monkeypatch.setenv("NEMOTRON_SRC_TARBALL_WARN_BYTES", "1")
+    pkg = SourcePackager(
+        repo_root=str(tmp_path),
+        script_path="src/nemotron/recipes/nano3/x.py",
+    )
+
+    pkg.package(None, str(tmp_path), "test")
+
+    captured = capsys.readouterr()
+    assert "source tarball is" in captured.err
+    assert "NEMOTRON_SRC_TARBALL_WARN_BYTES=0" in captured.err
+
+
 # ── plan_for ─────────────────────────────────────────────────────────────────
 
 
@@ -176,7 +260,7 @@ def test_plan_for_lepton_chunks_source_into_env_vars(tmp_path, monkeypatch):
     assert ".nemotron-src-failed" in script
     assert 'while [ "$i" -lt 600 ]' in script
     assert "timed out waiting for" in script
-    assert not plan.needs_pwd_symlinks
+    assert plan.needs_pwd_symlinks is True
 
 
 def test_plan_for_cloud_ready_marker_is_unique_per_submission(tmp_path, monkeypatch):
@@ -234,7 +318,7 @@ def test_plan_for_dgxcloud_chunks_source_into_env_vars(tmp_path, monkeypatch):
     # Env vars populated, no file-based PVC path.
     assert int(env_vars["_NEMOTRON_SRC_CHUNKS"]) >= 1
     assert "_NEMOTRON_SRC_CHUNK_0" in env_vars
-    assert not plan.needs_pwd_symlinks
+    assert plan.needs_pwd_symlinks is True
 
 
 def test_plan_for_fallback_uses_native_packager_path(tmp_path, monkeypatch):
diff --git a/tests/nemo_runspec/test_execution.py b/tests/nemo_runspec/test_execution.py
index a0d3249c5..42f37ec10 100644
--- a/tests/nemo_runspec/test_execution.py
+++ b/tests/nemo_runspec/test_execution.py
@@ -33,11 +33,13 @@
 from omegaconf import OmegaConf
 
 from nemo_runspec.execution import (
+    _cloud_config_path,
+    _cloud_script_path,
     _derive_cloud_workspace,
     _get_env,
     _git_mount_commands,
     _parse_netrc,
-    _cloud_script_path,
+    _pwd_symlink_cmd,
     _ray_node_source_sync_cmd,
     _to_plain,
     _transport_env_cleanup_cmd,
@@ -132,22 +134,32 @@ class TestCloudScriptPath:
     def test_rewrites_src_relative_script_to_pod_local_source(self):
         assert (
             _cloud_script_path(
-                "src/nemotron/steps/prep/pretrain_prep/step.py",
+                "src/nemotron/steps/data_prep/pretrain_prep/step.py",
                 "/mnt/work/_nemotron/src-deadbeef-12345678",
             )
-            == "/nemo_run/code/src/nemotron/steps/prep/pretrain_prep/step.py"
+            == "/nemo_run/code/src/nemotron/steps/data_prep/pretrain_prep/step.py"
         )
 
     def test_keeps_script_path_for_native_source_layout(self):
         assert (
             _cloud_script_path(
-                "src/nemotron/steps/prep/pretrain_prep/step.py",
+                "src/nemotron/steps/data_prep/pretrain_prep/step.py",
                 "/nemo_run/code/src",
             )
-            == "src/nemotron/steps/prep/pretrain_prep/step.py"
+            == "src/nemotron/steps/data_prep/pretrain_prep/step.py"
         )
 
 
+class TestCloudConfigPath:
+    def test_config_path_is_unique_and_keeps_content_digest(self):
+        first = _cloud_config_path("/mnt/work/_nemotron", b"stage: all\n")
+        second = _cloud_config_path("/mnt/work/_nemotron", b"stage: all\n")
+
+        assert first != second
+        assert re.match(r"^/mnt/work/_nemotron/config-[0-9a-f]{16}-[0-9a-f]{8}\.yaml$", first)
+        assert first.rsplit("-", 1)[0] == second.rsplit("-", 1)[0]
+
+
 class TestRayNodeSourceSync:
     def test_sync_command_pins_extraction_once_per_ray_node(self):
         cmd = _ray_node_source_sync_cmd(
@@ -165,6 +177,19 @@ def test_sync_command_is_noop_without_marker(self):
         assert _ray_node_source_sync_cmd("/mnt/work/_nemotron/src", None) == "true"
 
 
+class TestPwdSymlinkCmd:
+    def test_removes_stale_compat_paths_before_linking(self):
+        cmd = _pwd_symlink_cmd("/mnt/work/_nemotron", "/mnt/work/_nemotron/src-abc")
+
+        assert "mkdir -p /mnt/work/_nemotron/src" in cmd
+        assert "rm -rf /mnt/work/_nemotron/src/nemotron /mnt/work/_nemotron/src/nemo_runspec" in cmd
+        assert "ln -sfn /mnt/work/_nemotron/src-abc/nemotron /mnt/work/_nemotron/src/nemotron" in cmd
+        assert (
+            "ln -sfn /mnt/work/_nemotron/src-abc/nemo_runspec "
+            "/mnt/work/_nemotron/src/nemo_runspec"
+        ) in cmd
+
+
 class TestRayJobStatusAndLogs:
     def test_wait_for_ray_job_accepts_dict_status(self):
         class FakeRayJob:
@@ -840,8 +865,8 @@ def test_auto_mount_string_filtered_out(self):
         executor = create_executor(env=env, env_vars={}, packager=_FakePackager())
         assert executor.mounts == [{"path": "/data", "mount_path": "/data"}]
 
-    def test_pre_launch_includes_auto_mount_commands(self):
-        """Auto_mount registered repos become pre_launch git clones."""
+    def test_pre_launch_keeps_user_commands_only(self):
+        """auto_mount repos are cloned by the inline launch script, not Lepton pre-launch."""
         env = _make_env(
             executor="lepton",
             container_image="img",
@@ -851,9 +876,10 @@ def test_pre_launch_includes_auto_mount_commands(self):
         mounts = {"r": {"url": "u", "ref": "main", "target": "/opt/r"}}
         with patch("nemo_runspec.config.resolvers.get_git_mounts", return_value=mounts):
             executor = create_executor(env=env, env_vars={}, packager=_FakePackager())
-        # user commands preserved + git clone appended
+        # User commands are preserved; auto_mount clone commands are injected
+        # later into the inline launch script where ordering is controlled.
         assert "echo hello" in executor.pre_launch_commands
-        assert any("clone" in c and "/opt/r" in c for c in executor.pre_launch_commands)
+        assert not any("clone" in c and "/opt/r" in c for c in executor.pre_launch_commands)
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/steps/byob/test_byob_config.py b/tests/steps/byob/test_byob_config.py
new file mode 100644
index 000000000..51385cc11
--- /dev/null
+++ b/tests/steps/byob/test_byob_config.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+from nemotron.steps.byob.runtime.config import ByobConfig
+
+
+def test_checked_in_tiny_config_validates(tmp_path: Path) -> None:
+    config_path = (
+        Path(__file__).resolve().parents[3] / "src" / "nemotron" / "steps" / "byob" / "mcq" / "config" / "tiny.yaml"
+    )
+    config_data = yaml.safe_load(config_path.read_text(encoding="utf-8"))
+    input_dir = tmp_path / "input"
+    (input_dir / "maths").mkdir(parents=True)
+    (input_dir / "maths" / "tiny.txt").write_text("tiny source document\n", encoding="utf-8")
+    config_data["input_dir"] = str(input_dir)
+    config_data["output_dir"] = str(tmp_path / "output")
+    temp_config = tmp_path / "tiny.yaml"
+    temp_config.write_text(yaml.safe_dump(config_data), encoding="utf-8")
+
+    config = ByobConfig.from_yaml(str(temp_config))
+
+    assert config.do_coverage_check is False
+    assert config.semantic_deduplication_config["enabled"] is False
+    assert config.semantic_outlier_detection_config["enabled"] is False
diff --git a/tests/steps/convert/__init__.py b/tests/steps/convert/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/tests/steps/convert/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/steps/convert/test_convert_runners.py b/tests/steps/convert/test_convert_runners.py
new file mode 100644
index 000000000..59d540bcd
--- /dev/null
+++ b/tests/steps/convert/test_convert_runners.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from types import ModuleType
+
+import pytest
+from omegaconf import OmegaConf
+
+from nemotron.steps._runners import convert
+
+
+def _install_fake_megatron_bridge(monkeypatch: pytest.MonkeyPatch, auto_bridge: type) -> None:
+    megatron_mod = ModuleType("megatron")
+    megatron_mod.__path__ = []
+    bridge_mod = ModuleType("megatron.bridge")
+    bridge_mod.AutoBridge = auto_bridge
+    monkeypatch.setitem(sys.modules, "megatron", megatron_mod)
+    monkeypatch.setitem(sys.modules, "megatron.bridge", bridge_mod)
+
+
+def test_convert_steps_have_default_configs(steps_root: Path) -> None:
+    for step_id in ("hf_to_megatron", "megatron_to_hf", "merge_lora"):
+        config_path = steps_root / "convert" / step_id / "config" / "default.yaml"
+        assert config_path.exists(), f"{config_path} is required for nemotron steps run --dry-run"
+        assert isinstance(OmegaConf.to_container(OmegaConf.load(config_path), resolve=False), dict)
+
+
+def test_hf_to_megatron_forwards_autobridge_args(monkeypatch: pytest.MonkeyPatch) -> None:
+    calls: list[dict] = []
+
+    class FakeAutoBridge:
+        @staticmethod
+        def import_ckpt(**kwargs):
+            calls.append(kwargs)
+
+    _install_fake_megatron_bridge(monkeypatch, FakeAutoBridge)
+
+    convert.import_hf_to_megatron(
+        {
+            "hf_model_id": "hf-source",
+            "megatron_path": "/tmp/megatron",
+            "device_map": "auto",
+            "trust_remote_code": True,
+        }
+    )
+
+    assert calls == [
+        {
+            "hf_model_id": "hf-source",
+            "megatron_path": "/tmp/megatron",
+            "device_map": "auto",
+            "trust_remote_code": True,
+        }
+    ]
+
+
+def test_hf_to_megatron_prefers_torch_dtype_over_deprecated_alias(monkeypatch: pytest.MonkeyPatch) -> None:
+    calls: list[dict] = []
+    sentinel_dtype = object()
+
+    class FakeAutoBridge:
+        @staticmethod
+        def import_ckpt(**kwargs):
+            calls.append(kwargs)
+
+    _install_fake_megatron_bridge(monkeypatch, FakeAutoBridge)
+    monkeypatch.setattr(convert, "_torch_dtype", lambda name: sentinel_dtype if name == "float16" else name)
+
+    convert.import_hf_to_megatron(
+        {
+            "hf_model_id": "hf-source",
+            "megatron_path": "/tmp/megatron",
+            "dtype": "bfloat16",
+            "torch_dtype": "float16",
+        }
+    )
+
+    assert calls[0]["torch_dtype"] is sentinel_dtype
+
+
+def test_megatron_to_hf_prefers_hf_pretrained_export(monkeypatch: pytest.MonkeyPatch) -> None:
+    from_hf_pretrained_calls: list[tuple] = []
+    export_calls: list[dict] = []
+
+    class FakeBridge:
+        def export_ckpt(self, **kwargs):
+            export_calls.append(kwargs)
+
+    class FakeAutoBridge:
+        @staticmethod
+        def from_hf_pretrained(*args, **kwargs):
+            from_hf_pretrained_calls.append((args, kwargs))
+            return FakeBridge()
+
+    _install_fake_megatron_bridge(monkeypatch, FakeAutoBridge)
+
+    convert.export_megatron_to_hf(
+        megatron_path="/tmp/megatron/iter_0000001",
+        hf_model_id="hf-config",
+        hf_path="/tmp/hf",
+        trust_remote_code=True,
+        show_progress=False,
+        strict=False,
+    )
+
+    assert from_hf_pretrained_calls == [(("hf-config",), {"trust_remote_code": True})]
+    assert export_calls == [
+        {
+            "megatron_path": "/tmp/megatron/iter_0000001",
+            "hf_path": "/tmp/hf",
+            "show_progress": False,
+            "strict": False,
+        }
+    ]
+
+
+def test_megatron_to_hf_falls_back_to_auto_config_export(monkeypatch: pytest.MonkeyPatch) -> None:
+    from_auto_config_calls: list[tuple] = []
+
+    class FakeBridge:
+        def export_ckpt(self, **_kwargs):
+            pass
+
+    class FakeAutoBridge:
+        @staticmethod
+        def from_auto_config(*args, **kwargs):
+            from_auto_config_calls.append((args, kwargs))
+            return FakeBridge()
+
+    _install_fake_megatron_bridge(monkeypatch, FakeAutoBridge)
+
+    convert.export_megatron_to_hf(
+        megatron_path="/tmp/megatron/iter_0000001",
+        hf_model_id="hf-config",
+        hf_path="/tmp/hf",
+        trust_remote_code=True,
+    )
+
+    assert from_auto_config_calls == [(("/tmp/megatron/iter_0000001", "hf-config"), {"trust_remote_code": True})]
+
+
+def test_megatron_lora_merge_command_uses_cpu_script_by_default() -> None:
+    cmd = convert.build_megatron_lora_merge_command(
+        {
+            "upstream_script": "/opt/Megatron-Bridge/examples/peft/merge_lora.py",
+            "lora_checkpoint": "/tmp/lora",
+            "hf_model_id": "hf-config",
+            "base_megatron_path": "/tmp/base-megatron",
+            "cpu": True,
+            "tp": 1,
+            "pp": 1,
+            "ep": 1,
+        },
+        merged_megatron_path="/tmp/merged-megatron",
+    )
+
+    assert cmd[:2] == [sys.executable, "/opt/Megatron-Bridge/examples/peft/merge_lora.py"]
+    assert "--lora-checkpoint" in cmd
+    assert "/tmp/lora" in cmd
+    assert "--pretrained" in cmd
+    assert "/tmp/base-megatron" in cmd
+    assert "--cpu" in cmd
+
+
+def test_merge_backend_auto_uses_base_path_shape() -> None:
+    assert convert._resolve_merge_backend({"backend": "auto", "base_hf_path": "/tmp/base-hf"}) == "hf_peft"
+    assert (
+        convert._resolve_merge_backend({"backend": "auto", "base_megatron_path": "/tmp/base-megatron"})
+        == "megatron_bridge"
+    )
+
+
+@pytest.mark.parametrize("backend", ["hf", "peft", "megatron", "mbridge"])
+def test_merge_backend_hidden_aliases_are_rejected(monkeypatch: pytest.MonkeyPatch, backend: str) -> None:
+    monkeypatch.setattr(convert, "load_convert_config", lambda _default_config: {"backend": backend})
+    # Each parametrized name must be rejected with an error that lists auto, hf_peft, megatron_bridge.
+    with pytest.raises(ValueError, match="auto, hf_peft, megatron_bridge"):
+        convert.run_merge_lora(Path("unused.yaml"))  # the config loader is patched above, so this path is a dummy
+
+
+def test_hf_peft_adapter_path_resolves_nested_latest_checkpoint(tmp_path: Path) -> None:
+    old_adapter = tmp_path / "global_step5" / "model"
+    new_adapter = tmp_path / "global_step10" / "model"
+    old_adapter.mkdir(parents=True)
+    new_adapter.mkdir(parents=True)
+    (old_adapter / "adapter_config.json").write_text("{}", encoding="utf-8")
+    (new_adapter / "adapter_config.json").write_text("{}", encoding="utf-8")
+
+    assert convert._resolve_hf_peft_adapter_path(str(tmp_path)) == str(new_adapter)
+
+
+def test_hf_peft_adapter_path_error_points_to_adapter_config(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="adapter_config.json"):
+        convert._resolve_hf_peft_adapter_path(str(tmp_path))
+
+
+def test_missing_required_config_value_is_clear() -> None:
+    with pytest.raises(ValueError, match="hf_model_id"):  # the error text must name the missing key
+        convert.import_hf_to_megatron({"megatron_path": "/tmp/megatron"})  # config without "hf_model_id"
diff --git a/tests/steps/prep/__init__.py b/tests/steps/data_prep/__init__.py
similarity index 100%
rename from tests/steps/prep/__init__.py
rename to tests/steps/data_prep/__init__.py
diff --git a/tests/steps/prep/test_pretrain_prep.py b/tests/steps/data_prep/test_pretrain_prep.py
similarity index 70%
rename from tests/steps/prep/test_pretrain_prep.py
rename to tests/steps/data_prep/test_pretrain_prep.py
index 772bdcc95..6f7f1049f 100644
--- a/tests/steps/prep/test_pretrain_prep.py
+++ b/tests/steps/data_prep/test_pretrain_prep.py
@@ -3,15 +3,15 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 
-"""Static checks for ``steps/prep/pretrain_prep``."""
+"""Static checks for ``steps/data_prep/pretrain_prep``."""
 
 from .._step_helpers import assert_step_static, step_dir
 
 
 def test_pretrain_prep_static() -> None:
     assert_step_static(
-        step_dir(__file__, "prep", "pretrain_prep"),
-        expected_name="steps/prep/pretrain_prep",
+        step_dir(__file__, "data_prep", "pretrain_prep"),
+        expected_name="steps/data_prep/pretrain_prep",
         expected_launch="python",
         expected_default_config="default",
     )
diff --git a/tests/steps/prep/test_rl_prep.py b/tests/steps/data_prep/test_rl_prep.py
similarity index 72%
rename from tests/steps/prep/test_rl_prep.py
rename to tests/steps/data_prep/test_rl_prep.py
index 706f753f0..44bef254c 100644
--- a/tests/steps/prep/test_rl_prep.py
+++ b/tests/steps/data_prep/test_rl_prep.py
@@ -3,15 +3,15 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 
-"""Static checks for ``steps/prep/rl_prep``."""
+"""Static checks for ``steps/data_prep/rl_prep``."""
 
 from .._step_helpers import assert_step_static, step_dir
 
 
 def test_rl_prep_static() -> None:
     assert_step_static(
-        step_dir(__file__, "prep", "rl_prep"),
-        expected_name="steps/prep/rl_prep",
+        step_dir(__file__, "data_prep", "rl_prep"),
+        expected_name="steps/data_prep/rl_prep",
         expected_launch="python",
         expected_default_config="default",
     )
diff --git a/tests/steps/prep/test_sft_packing.py b/tests/steps/data_prep/test_sft_packing.py
similarity index 77%
rename from tests/steps/prep/test_sft_packing.py
rename to tests/steps/data_prep/test_sft_packing.py
index 124e1a031..0ba17789f 100644
--- a/tests/steps/prep/test_sft_packing.py
+++ b/tests/steps/data_prep/test_sft_packing.py
@@ -3,7 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 
-"""Static checks for ``steps/prep/sft_packing``.
+"""Static checks for ``steps/data_prep/sft_packing``.
 
 This step pre-existed the generic CLI work but is important for the agentic
 pipeline (it produces ``packed_parquet`` consumed by sft/megatron_bridge).
@@ -14,8 +14,8 @@
 
 def test_sft_packing_static() -> None:
     assert_step_static(
-        step_dir(__file__, "prep", "sft_packing"),
-        expected_name="steps/prep/sft_packing",
+        step_dir(__file__, "data_prep", "sft_packing"),
+        expected_name="steps/data_prep/sft_packing",
         expected_launch="python",
         expected_default_config="default",
     )
diff --git a/tests/steps/sdg/test_data_designer.py b/tests/steps/sdg/test_data_designer.py
index aebf791c2..77adfa055 100644
--- a/tests/steps/sdg/test_data_designer.py
+++ b/tests/steps/sdg/test_data_designer.py
@@ -18,6 +18,7 @@
 import yaml
 
 from nemotron.steps.sdg.data_designer.step import (
+    build_model_providers,
     parse_json_object,
     project_records,
     records_from_designer_result,
@@ -27,6 +28,7 @@
 
 VALID_COLUMN_TYPES = {"category", "seed", "llm_text", "llm_structured", "llm_judge"}
 LLM_COLUMN_TYPES = {"llm_text", "llm_structured", "llm_judge"}
+BUILTIN_PROVIDER_NAMES = {"anthropic", "nvidia", "openai", "openrouter"}
 
 STEP = step_dir(__file__, "sdg", "data_designer")
 REPO_ROOT = STEP.parents[4]
@@ -121,6 +123,86 @@ def test_llm_columns_reference_declared_model_aliases() -> None:
                 assert alias in aliases, f"{path.name}: column {col['name']!r} references unknown model {alias!r}"
 
 
+def test_custom_providers_are_well_formed() -> None:
+    """Custom providers need unique names, an endpoint, a supported type and no ``${oc.env:`` API key."""
+    for config_path in _config_paths():
+        label = config_path.name
+        entries = _load_config(config_path).get("providers") or []
+        assert isinstance(entries, list), f"{label}: providers must be a list"
+        seen = []
+        for entry in entries:
+            assert isinstance(entry, dict), f"{label}: providers entries must be mappings"
+            assert entry.get("name"), f"{label}: providers entries require name"
+            assert entry.get("endpoint"), f"{label}: provider {entry.get('name')!r} requires endpoint"
+            kind = entry.get("provider_type", "openai")
+            assert kind in {"anthropic", "openai"}, (
+                f"{label}: provider {entry['name']!r} has unsupported provider_type {kind!r}"
+            )
+            key = entry.get("api_key")
+            # Configs carry the env var name; an OmegaConf env interpolation would resolve the secret.
+            assert not isinstance(key, str) or not key.startswith("${oc.env:"), (
+                f"{label}: provider {entry['name']!r} should reference the API key env var name, "
+                "not resolve the secret through OmegaConf"
+            )
+            seen.append(entry["name"])
+        assert len(seen) == len(set(seen)), f"{label}: provider names must be unique"
+
+
+def test_model_providers_reference_declared_or_builtin_providers() -> None:
+    """Each model's provider must be declared in the same config or be a built-in name."""
+    for path in _config_paths():
+        config = _load_config(path)
+        custom = {entry["name"] for entry in config.get("providers") or []}
+        known = custom | BUILTIN_PROVIDER_NAMES
+        for model in config.get("models") or []:
+            ref = model.get("provider")
+            if custom:
+                assert ref, f"{path.name}: models[].provider is required when custom providers are declared"
+            if ref:
+                assert ref in known, f"{path.name}: model {model['alias']!r} references unknown provider {ref!r}"
+
+
+def test_build_model_providers_from_config() -> None:
+    """Provider entries become ``ModelProvider`` kwargs; an empty ``api_key`` is passed as ``None``."""
+
+    # Captures constructor kwargs so the test can inspect what was forwarded.
+    class RecordingProvider:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    # Minimal stand-in for the second argument; it only exposes ``ModelProvider``.
+    class StubDD:
+        ModelProvider = RecordingProvider
+
+    authenticated = {
+        "name": "my-provider",
+        "endpoint": "https://example.test/v1",
+        "provider_type": "openai",
+        "api_key": "OPENAI_API_KEY",
+        "extra_body": {"foo": "bar"},
+        "extra_headers": {"X-Test": "1"},
+    }
+    unauthenticated = {
+        "name": "no-auth-provider",
+        "endpoint": "http://localhost:8000/v1",
+        "api_key": "",
+    }
+
+    built = build_model_providers({"providers": [authenticated, unauthenticated]}, StubDD)
+
+    assert built is not None
+    assert built[0].kwargs == {
+        "name": "my-provider",
+        "endpoint": "https://example.test/v1",
+        "provider_type": "openai",
+        "api_key": "OPENAI_API_KEY",
+        "extra_body": {"foo": "bar"},
+        "extra_headers": {"X-Test": "1"},
+    }
+    # The blank key of the second entry is expected to surface as ``None``.
+    assert built[1].kwargs["api_key"] is None
+
+
 def test_structured_llm_columns_have_output_format() -> None:
     for path in _config_paths():
         for col in _load_columns(path):
diff --git a/tests/steps/test_cloud_backend.py b/tests/steps/test_cloud_backend.py
index ba1fad64f..576ec7588 100644
--- a/tests/steps/test_cloud_backend.py
+++ b/tests/steps/test_cloud_backend.py
@@ -19,9 +19,9 @@
 
 import pytest
 
-import nemotron.cli.commands.step.backends.cloud as cloud_mod
-from nemotron.cli.commands.step.backends.base import JobContext
-from nemotron.cli.commands.step.backends.cloud import CloudBackend
+import nemotron.cli.commands.steps.backends.cloud as cloud_mod
+from nemotron.cli.commands.steps.backends.base import JobContext
+from nemotron.cli.commands.steps.backends.cloud import CloudBackend
 
 
 def _ctx(step_id: str, *, launch: str = "ray") -> JobContext:
@@ -53,12 +53,12 @@ def fake_execute_cloud(*_args, **kwargs):
         calls.append(("inline", kwargs))
 
     def fail_execute_cloud_ray(*_args, **_kwargs):
-        raise AssertionError("prep steps must not use cloud RayCluster submission")
+        raise AssertionError("data_prep steps must not use cloud RayCluster submission")
 
     monkeypatch.setattr(cloud_mod, "execute_cloud", fake_execute_cloud)
     monkeypatch.setattr(cloud_mod, "execute_cloud_ray", fail_execute_cloud_ray)
 
-    CloudBackend().submit(_ctx("prep/sft_packing"))
+    CloudBackend().submit(_ctx("data_prep/sft_packing"))
 
     assert len(calls) == 1
     assert calls[0][0] == "inline"
diff --git a/tests/steps/test_curator_runtime_bootstrap.py b/tests/steps/test_curator_runtime_bootstrap.py
new file mode 100644
index 000000000..34fb3a2f2
--- /dev/null
+++ b/tests/steps/test_curator_runtime_bootstrap.py
@@ -0,0 +1,310 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from nemotron.steps._bootstrap import curator_runtime, runtime_payloads
+
+
+def _write_pyproject(root: Path) -> Path:
+    """Write a demo ``pyproject.toml`` and a placeholder ``uv.lock`` into *root*; return the pyproject path."""
+    content = """
+[project]
+name = "demo"
+version = "0"
+
+[project.optional-dependencies]
+byob = [
+    "data-designer==0.5.5",
+    "nemo-curator[translation_all] @ git+https://example.invalid/Curator.git",
+]
+byob-gpu = [
+    "cupy-cuda12x==14.0.1",
+]
+translate = [
+    "nemo-curator[translation_all] @ git+https://example.invalid/Curator.git",
+    "sacrebleu==2.6.0",
+]
+curate = [
+    "nemo-curator @ git+https://example.invalid/Curator.git",
+    "pyyaml==6.0.2",
+]
+
+[tool.uv]
+constraint-dependencies = ["transformers>=4.56.0,<5.0"]
+override-dependencies = ["torch==2.10.0"]
+
+[tool.nemotron.runtime.byob]
+extras = ["byob"]
+venv-name = "byob"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = ["data_designer"]
+
+[tool.nemotron.runtime.byob-gpu]
+extras = ["byob", "byob-gpu"]
+venv-name = "byob-gpu"
+extra-index-urls = ["https://pypi.nvidia.com"]
+torch-backend = "cu128"
+omit-packages = ["nemo-curator"]
+required-imports = ["data_designer"]
+spec-only-imports = ["cupy"]
+
+[tool.nemotron.runtime.translate]
+extras = ["translate"]
+venv-name = "translate"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = ["nemo_curator", "yaml"]
+
+[tool.nemotron.runtime.curate]
+extras = ["curate"]
+venv-name = "curate"
+extra-index-urls = ["https://pypi.nvidia.com"]
+omit-packages = ["nemo-curator"]
+required-imports = ["huggingface_hub", "nemo_curator", "yaml"]
+""".lstrip()
+    target = root / "pyproject.toml"
+    target.write_text(content, encoding="utf-8")
+    # Placeholder lock file written next to the pyproject.
+    (root / "uv.lock").write_text("# fake lock\n", encoding="utf-8")
+    return target
+
+
+def _write_runtime_manifest(root: Path) -> Path:
+    """Create ``<root>/.nemotron_runtime`` with byob pin files and a four-profile ``runtime.json``."""
+    runtime_dir = root / ".nemotron_runtime"
+    runtime_dir.mkdir()
+    pin_files = {
+        "byob.requirements.txt": "data-designer==0.5.5\ntransitive-curator-dependency==1.0.0\n",
+        "byob.constraints.txt": "transformers>=4.56.0,<5.0\n",
+        "byob.overrides.txt": "huggingface-hub>=0.34,<1.0\n",
+    }
+    for filename, text in pin_files.items():
+        (runtime_dir / filename).write_text(text, encoding="utf-8")
+    profiles = {}
+    for profile in ("byob", "byob-gpu", "translate", "curate"):
+        is_byob = profile == "byob"
+        profiles[profile] = {
+            "name": profile,
+            "venv_name": profile,
+            "extras": ["byob"],
+            "requirements": "byob.requirements.txt",
+            "constraints": "byob.constraints.txt",
+            "overrides": "byob.overrides.txt",
+            "extra_index_urls": ["https://pypi.nvidia.com"],
+            "torch_backend": "cu128",
+            "required_modules": ["data_designer"] if is_byob else ["nemo_curator"],
+            "spec_only_modules": ["cupy"] if is_byob else [],
+            "digest": "abc123",
+        }
+    manifest = {"version": 1, "profiles": profiles}
+    (runtime_dir / "runtime.json").write_text(json.dumps(manifest), encoding="utf-8")
+    return runtime_dir
+
+
+def test_runtime_manifest_drives_profile_without_pyproject(tmp_path: Path) -> None:
+    """With only a runtime manifest present, metadata, spec and requirement files all point into it."""
+    manifest_dir = _write_runtime_manifest(tmp_path)
+    project = curator_runtime._find_project_metadata(tmp_path)  # noqa: SLF001
+    byob_spec = curator_runtime.load_runtime_spec("byob", project)
+    files = curator_runtime._build_requirement_files(project, byob_spec, tmp_path)  # noqa: SLF001
+
+    assert project.root == manifest_dir
+    assert byob_spec.name == "byob"
+    assert byob_spec.requirements_file == manifest_dir / "byob.requirements.txt"
+    # Each kind of pin file resolves to ``byob.<kind>.txt`` inside the manifest directory.
+    for kind in ("requirements", "constraints", "overrides"):
+        assert files[kind] == manifest_dir / f"byob.{kind}.txt"
+
+
+def test_unknown_runtime_profile_fails_from_pyproject(tmp_path: Path) -> None:
+    """Requesting a profile name that the pyproject does not define raises ``ValueError``."""
+    _write_pyproject(tmp_path)
+    project = curator_runtime._find_project_metadata(tmp_path)  # noqa: SLF001
+    with pytest.raises(ValueError, match="Runtime profile 'translation' is not defined"):
+        curator_runtime.load_runtime_spec("translation", project)
+
+
+def test_named_curator_runtime_profiles_from_pyproject(tmp_path: Path) -> None:
+    """Each named profile loads from the pyproject with its own venv name and extras."""
+    _write_pyproject(tmp_path)
+    project = curator_runtime._find_project_metadata(tmp_path)  # noqa: SLF001
+    names = ("byob", "byob-gpu", "translate", "curate")
+
+    loaded = {}
+    for name in names:
+        loaded[name] = curator_runtime.load_runtime_spec(name, project)
+
+    assert set(loaded) == {"byob", "byob-gpu", "translate", "curate"}
+    for name in ("byob-gpu", "translate", "curate"):
+        assert loaded[name].venv_name == name
+    assert loaded["byob"].extras == ("byob",)
+    assert loaded["byob-gpu"].extras == ("byob", "byob-gpu")
+    assert loaded["translate"].extras == ("translate",)
+    assert loaded["curate"].extras == ("curate",)
+
+
+def test_build_requirement_files_from_pyproject_extra(tmp_path: Path) -> None:
+    """Requirement files follow the profile's extras; constraints and overrides mirror ``[tool.uv]``."""
+    _write_pyproject(tmp_path)
+    project = curator_runtime._find_project_metadata(tmp_path)  # noqa: SLF001
+    cpu_spec = curator_runtime.load_runtime_spec("byob", project)
+    out_dir = tmp_path / "out"
+    out_dir.mkdir()
+
+    cpu_files = curator_runtime._build_requirement_files(project, cpu_spec, out_dir)  # noqa: SLF001
+    cpu_requirements = cpu_files["requirements"].read_text(encoding="utf-8")
+    cpu_constraints = cpu_files["constraints"].read_text(encoding="utf-8")
+    cpu_overrides = cpu_files["overrides"].read_text(encoding="utf-8")
+    assert "data-designer==0.5.5" in cpu_requirements
+    assert "cupy-cuda12x==14.0.1" not in cpu_requirements
+    assert "nemo-curator" not in cpu_requirements
+    assert "transformers>=4.56.0,<5.0" in cpu_constraints
+    assert "torch==2.10.0" in cpu_overrides
+
+    # The GPU profile lists the ``byob-gpu`` extra in addition to ``byob``.
+    gpu_spec = curator_runtime.load_runtime_spec("byob-gpu", project)
+    gpu_files = curator_runtime._build_requirement_files(project, gpu_spec, out_dir)  # noqa: SLF001
+    gpu_requirements = gpu_files["requirements"].read_text(encoding="utf-8")
+    for pin in ("data-designer==0.5.5", "cupy-cuda12x==14.0.1"):
+        assert pin in gpu_requirements
+
+
+def test_runtime_payloads_ship_uv_constraints_and_overrides(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """The written manifest names per-profile requirement files and byob's ``[tool.uv]`` pins land on disk."""
+    _write_pyproject(tmp_path)
+    # Make ``shutil.which`` report that no executable is available.
+    monkeypatch.setattr(runtime_payloads.shutil, "which", lambda _: None)
+    payload_dir = tmp_path / "runtime"
+    runtime_payloads.write_runtime_payloads(tmp_path, output_dir=payload_dir)
+
+    profiles = json.loads((payload_dir / "runtime.json").read_text(encoding="utf-8"))["profiles"]
+    byob = profiles["byob"]
+
+    assert set(profiles) == {"byob", "byob-gpu", "curate", "translate"}
+    for kind in ("requirements", "constraints", "overrides"):
+        assert byob[kind] == f"byob.{kind}.txt"
+    for profile in ("byob-gpu", "translate", "curate"):
+        assert profiles[profile]["requirements"] == f"{profile}.requirements.txt"
+
+    assert (payload_dir / "byob.constraints.txt").read_text(encoding="utf-8") == "transformers>=4.56.0,<5.0\n"
+    assert (payload_dir / "byob.overrides.txt").read_text(encoding="utf-8") == "torch==2.10.0\n"
+
+
+def test_runtime_payload_build_fails_when_uv_export_fails(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """A non-zero result from the patched ``subprocess.run`` surfaces as ``RuntimeError``."""
+    _write_pyproject(tmp_path)
+
+    class StaleLockResult:
+        returncode = 1
+        stdout = ""
+        stderr = "lockfile is stale"
+
+    monkeypatch.setattr(runtime_payloads.shutil, "which", lambda _: "/bin/uv")
+    monkeypatch.setattr(runtime_payloads.subprocess, "run", lambda *args, **kwargs: StaleLockResult())
+    with pytest.raises(RuntimeError, match="uv export failed"):
+        runtime_payloads.build_runtime_payloads(tmp_path)
+
+
+def test_runtime_payload_env_drives_manifest(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """Payloads carried in env vars resolve to ``<metadata root>/<sha prefix>`` and still load ``byob``."""
+    manifest_dir = _write_runtime_manifest(tmp_path)
+    files = [(entry.name, entry.read_bytes()) for entry in sorted(manifest_dir.iterdir())]
+    encoded = runtime_payloads.encode_runtime_payload_env(files, chunk_size=32)
+    metadata_root = tmp_path / "metadata"
+
+    for key, value in encoded.items():
+        monkeypatch.setenv(key, value)
+    monkeypatch.setenv("NEMOTRON_CURATOR_METADATA_ROOT", str(metadata_root))
+    monkeypatch.setattr(curator_runtime, "DEFAULT_METADATA_ROOT", metadata_root)
+
+    project = curator_runtime._find_project_metadata()  # noqa: SLF001
+    spec = curator_runtime.load_runtime_spec("byob", project)
+    built = curator_runtime._build_requirement_files(project, spec, tmp_path)  # noqa: SLF001
+    assert project.root == metadata_root / encoded[runtime_payloads.RUNTIME_PAYLOAD_SHA256_ENV][:16]
+    assert spec.name == "byob"
+    for kind in ("requirements", "constraints", "overrides"):
+        assert built[kind].name == f"byob.{kind}.txt"
+
+
+def test_runtime_payload_env_reports_missing_chunks(tmp_path: Path) -> None:
+    """Dropping chunk 1 from the encoded payload makes decoding report that index."""
+    manifest_dir = _write_runtime_manifest(tmp_path)
+    files = [(entry.name, entry.read_bytes()) for entry in sorted(manifest_dir.iterdir())]
+    encoded = runtime_payloads.encode_runtime_payload_env(files, chunk_size=32)
+    del encoded[f"{runtime_payloads.RUNTIME_PAYLOAD_CHUNK_PREFIX}1"]
+
+    with pytest.raises(RuntimeError, match="missing chunk index\\(es\\) 1"):
+        runtime_payloads.decode_runtime_payload_env(encoded)
+
+
+def test_runtime_payload_env_reports_missing_chunk_count(tmp_path: Path) -> None:
+    """Without the chunk-count variable, decoding fails and names that variable in the error."""
+    manifest_dir = _write_runtime_manifest(tmp_path)
+    files = [(entry.name, entry.read_bytes()) for entry in sorted(manifest_dir.iterdir())]
+    encoded = runtime_payloads.encode_runtime_payload_env(files, chunk_size=32)
+    del encoded[runtime_payloads.RUNTIME_PAYLOAD_CHUNKS_ENV]
+    with pytest.raises(RuntimeError, match=runtime_payloads.RUNTIME_PAYLOAD_CHUNKS_ENV):
+        runtime_payloads.decode_runtime_payload_env(encoded)
+
+
+def test_locked_requirement_files_use_uv_export_and_filter_omits(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    """With ``uv`` supplied, requirements come from the captured export output minus omitted packages."""
+    _write_pyproject(tmp_path)
+    project = curator_runtime._find_project_metadata(tmp_path)  # noqa: SLF001
+    spec = curator_runtime.load_runtime_spec("byob", project)
+    export_lines = (
+        "data-designer==0.5.5",
+        "nemo-curator @ git+https://example.invalid/Curator.git",
+        "transitive-curator-dependency==1.0.0",
+    )
+    recorded = []
+
+    def fake_run_capture(argv, *, cwd, env):  # noqa: ANN001
+        recorded.append((argv, cwd, env))
+        return "\n".join(export_lines)
+
+    monkeypatch.setattr(curator_runtime, "_run_capture", fake_run_capture)
+
+    locked_dir = tmp_path / "locked"
+    locked_dir.mkdir()
+    files = curator_runtime._build_requirement_files(  # noqa: SLF001
+        project,
+        spec,
+        locked_dir,
+        uv=Path("/venv/bin/uv"),
+        env={"PATH": "/venv/bin"},
+    )
+
+    requirements = files["requirements"].read_text(encoding="utf-8")
+    assert "data-designer==0.5.5" in requirements
+    assert "transitive-curator-dependency==1.0.0" in requirements
+    assert "nemo-curator" not in requirements
+    assert files["constraints"] is None
+    assert files["overrides"] is None
+    first_argv, first_cwd, _ = recorded[0]
+    assert first_cwd == project.root
+    assert "--extra" in first_argv and "byob" in first_argv
+
+
+def test_normalize_command_replaces_python_with_runtime_python(tmp_path: Path) -> None:
+    """The leading ``--`` is dropped and ``python`` is swapped for the runtime interpreter path."""
+    interpreter = tmp_path / "venv" / "bin" / "python"
+    argv = ["--", "python", "-m", "nemotron.steps.byob.mcq.step"]
+
+    normalized = curator_runtime._normalize_command(argv, interpreter)  # noqa: SLF001
+    assert normalized == [str(interpreter), "-m", "nemotron.steps.byob.mcq.step"]
+
+
+def test_normalize_command_requires_payload(tmp_path: Path) -> None:
+    with pytest.raises(ValueError, match="missing command"):  # a bare "--" carries no command after it
+        curator_runtime._normalize_command(["--"], tmp_path / "python")  # noqa: SLF001
diff --git a/tests/steps/test_index.py b/tests/steps/test_index.py
index e82d636b3..141dcdefb 100644
--- a/tests/steps/test_index.py
+++ b/tests/steps/test_index.py
@@ -16,6 +16,8 @@
 
 from pathlib import Path
 
+from nemotron.steps.index import discover_steps
+
 
 def test_steps_index_exists(steps_root: Path) -> None:
     assert (steps_root / "STEPS.md").exists(), "src/nemotron/steps/STEPS.md does not exist"
@@ -35,3 +37,27 @@ def test_every_step_manifest_is_mentioned_in_steps_index(
         assert step_dir in steps_index, (
             f"{manifest_path}: step directory {step_dir!r} is not mentioned in STEPS.md"
         )
+
+
+def test_legacy_data_designer_namespace_is_not_discoverable(steps_root: Path) -> None:
+    """``sdg/data_designer`` is discoverable; its former id is absent from discovery and STEPS.md."""
+    found_ids = {step.id for step in discover_steps(steps_root)}
+    old_id = "/".join(("syn" + "th", "data_" + "designer"))
+    assert "sdg/data_designer" in found_ids
+    assert old_id not in found_ids
+    assert old_id not in (steps_root / "STEPS.md").read_text(encoding="utf-8")
+
+
+def test_discovered_steps_have_runners(steps_root: Path) -> None:
+    """Every discovered step directory contains a ``step.py``."""
+    orphans = [step.id for step in discover_steps(steps_root) if not (step.path / "step.py").exists()]
+    assert not orphans, f"Discovered steps without step.py runners: {orphans}"
+
+
+def test_legacy_grpo_step_is_not_discoverable(steps_root: Path) -> None:
+    """``rl/nemo_rl/rlvr`` is discoverable; the former GRPO id is absent from discovery and STEPS.md."""
+    found_ids = {step.id for step in discover_steps(steps_root)}
+    old_id = "/".join(("rl", "nemo_rl_" + "grpo"))
+    assert "rl/nemo_rl/rlvr" in found_ids
+    assert old_id not in found_ids
+    assert old_id not in (steps_root / "STEPS.md").read_text(encoding="utf-8")
diff --git a/tests/steps/test_run_cmd_runtime_preflight.py b/tests/steps/test_run_cmd_runtime_preflight.py
new file mode 100644
index 000000000..99a0fe912
--- /dev/null
+++ b/tests/steps/test_run_cmd_runtime_preflight.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import typer
+
+from nemotron.cli.commands.steps import run_cmd
+from nemotron.steps._bootstrap import runtime_payloads
+
+
+def _write_step_tree(root: Path) -> Path:  # pyproject + empty byob/mcq step.py; returns the step path
+    step_file = root.joinpath("src", "nemotron", "steps", "byob", "mcq", "step.py")
+    (root / "pyproject.toml").write_text("[project]\nname = 'demo'\n", encoding="utf-8")
+    step_file.parent.mkdir(parents=True)
+    step_file.write_text("", encoding="utf-8")
+    return step_file
+
+
+CURATOR_RUN_COMMAND = (  # shared ``env["run_command"]`` value: a step wrapped by the curator_runtime bootstrap
+    "python -m nemotron.steps._bootstrap.curator_runtime --profile byob -- python -m custom.step"
+)
+
+
+def test_curator_runtime_env_vars_for_remote_curator_command(monkeypatch, tmp_path: Path) -> None:
+    """In ``run`` mode a source checkout builds payloads from its root and encodes them into env vars."""
+    step_file = _write_step_tree(tmp_path)
+    payload = [("runtime.json", b'{"version": 1, "profiles": {}}')]
+    encoded = {"NEMOTRON_CURATOR_RUNTIME_CHUNKS": "1"}
+    built_from: list[Path] = []
+    encoded_with = []
+
+    def fake_build_runtime_payloads(root: Path):
+        built_from.append(root)
+        return payload
+
+    def fake_encode_runtime_payload_env(payloads):  # noqa: ANN001
+        encoded_with.append(payloads)
+        return encoded
+
+    monkeypatch.setattr(runtime_payloads, "build_runtime_payloads", fake_build_runtime_payloads)
+    monkeypatch.setattr(runtime_payloads, "encode_runtime_payload_env", fake_encode_runtime_payload_env)
+
+    result = run_cmd._build_curator_runtime_env_vars(  # noqa: SLF001
+        script_path=step_file,
+        env={"run_command": CURATOR_RUN_COMMAND},
+        mode="run",
+    )
+
+    assert built_from == [tmp_path]
+    assert encoded_with == [[("runtime.json", b'{"version": 1, "profiles": {}}')]]
+    assert result == {"NEMOTRON_CURATOR_RUNTIME_CHUNKS": "1"}
+    assert not (tmp_path / "src" / "nemotron" / "steps" / "_bootstrap" / "runtime").exists()
+
+
+def test_curator_runtime_env_vars_skip_local(monkeypatch, tmp_path: Path) -> None:
+    """``local`` mode yields no env vars and never calls the payload builder."""
+    step_file = _write_step_tree(tmp_path)
+    builder_calls = []
+
+    def record_build(*args, **kwargs):
+        builder_calls.append(args)
+
+    monkeypatch.setattr(runtime_payloads, "build_runtime_payloads", record_build)
+
+    result = run_cmd._build_curator_runtime_env_vars(  # noqa: SLF001
+        script_path=step_file,
+        env={"run_command": CURATOR_RUN_COMMAND},
+        mode="local",
+    )
+    assert builder_calls == []
+    assert result == {}
+
+
+def test_curator_runtime_env_vars_uses_packaged_runtime_when_not_source_checkout(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    """Outside a source checkout the packaged payloads are read and passed to the encoder."""
+    # Unlike the source-tree fixture, no pyproject.toml is written here.
+    step_file = tmp_path / "site-packages" / "nemotron" / "steps" / "byob" / "mcq" / "step.py"
+    step_file.parent.mkdir(parents=True)
+    step_file.write_text("", encoding="utf-8")
+
+    packaged = [("runtime.json", b'{"version": 1, "profiles": {}}')]
+    encoded_with = []
+
+    def fake_encode(payloads):  # noqa: ANN001
+        encoded_with.append(payloads)
+        return {"NEMOTRON_CURATOR_RUNTIME_CHUNKS": "1"}
+
+    monkeypatch.setattr(runtime_payloads, "read_runtime_payloads", lambda: list(packaged))
+    monkeypatch.setattr(runtime_payloads, "encode_runtime_payload_env", fake_encode)
+
+    result = run_cmd._build_curator_runtime_env_vars(  # noqa: SLF001
+        script_path=step_file,
+        env={"run_command": CURATOR_RUN_COMMAND},
+        mode="run",
+    )
+
+    assert encoded_with == [[("runtime.json", b'{"version": 1, "profiles": {}}')]]
+    assert result == {"NEMOTRON_CURATOR_RUNTIME_CHUNKS": "1"}
+
+
+def test_curator_runtime_env_vars_fail_fast_without_source_or_packaged_runtime(
+    monkeypatch,
+    tmp_path: Path,
+) -> None:
+    """With neither a source checkout nor packaged payloads, the helper exits with code 1."""
+    step_file = tmp_path / "site-packages" / "nemotron" / "steps" / "byob" / "mcq" / "step.py"
+    step_file.parent.mkdir(parents=True)
+    step_file.write_text("", encoding="utf-8")
+
+    # An empty list from the patched reader stands for "no packaged runtime".
+    monkeypatch.setattr(runtime_payloads, "read_runtime_payloads", lambda: [])
+    call_kwargs = {"script_path": step_file, "env": {"run_command": CURATOR_RUN_COMMAND}, "mode": "run"}
+
+    with pytest.raises(typer.Exit) as raised:
+        run_cmd._build_curator_runtime_env_vars(**call_kwargs)  # noqa: SLF001
+
+    assert raised.value.exit_code == 1
+
+
+def test_uses_curator_runtime_from_run_command() -> None:  # positive case only: the command invokes the bootstrap
+    assert run_cmd._uses_curator_runtime({"run_command": CURATOR_RUN_COMMAND})  # noqa: SLF001
diff --git a/tests/steps/test_slurm_backend_run_command.py b/tests/steps/test_slurm_backend_run_command.py
new file mode 100644
index 000000000..ab5f85e00
--- /dev/null
+++ b/tests/steps/test_slurm_backend_run_command.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from pathlib import Path
+from types import SimpleNamespace
+
+from nemotron.cli.commands.steps.backends import JobContext
+from nemotron.cli.commands.steps.backends.slurm import SlurmBackend
+
+
+def _ctx(*, env: dict[str, object], cmd: str | None = None) -> JobContext:
+    """Build a ``JobContext`` for ``byob/mcq`` whose run spec carries *cmd* and whose env is *env*."""
+    # Plain namespaces stand in for the step spec; only these attributes are populated.
+    run_spec = SimpleNamespace(cmd=cmd, launch="python")
+    step_spec = SimpleNamespace(run=run_spec, image=None, resources=None)
+
+    return JobContext(
+        step_id="byob/mcq",
+        script_path=Path("/repo/src/nemotron/steps/byob/mcq/step.py"),
+        train_path=Path("/repo/.nemotron/train.yaml"),
+        spec=step_spec,
+        env=env,
+        env_vars={},
+        passthrough=[],
+        startup_commands=[],
+        attached=True,
+        force_squash=False,
+    )
+
+
+def test_slurm_backend_honors_env_run_command() -> None:
+    """``env['run_command']`` is used, with ``{config}`` filled in and a ``PYTHONPATH`` export prepended."""
+    prefix = "python -m nemotron.steps._bootstrap.curator_runtime --profile byob "
+    template = prefix + "-- python -m nemotron.steps.byob.mcq.step --config {config}"
+    built = SlurmBackend._build_cmd(_ctx(env={"run_command": template}))
+
+    assert built == (
+        "export PYTHONPATH=/nemo_run/code/src${PYTHONPATH:+:$PYTHONPATH}; "
+        + prefix
+        + "-- python -m nemotron.steps.byob.mcq.step --config config.yaml"
+    )
+
+
+def test_slurm_backend_uses_code_packager_for_curator_runtime() -> None:
+    """A run command that goes through the curator runtime bootstrap selects the code packager."""
+    env = {
+        "run_command": "python -m nemotron.steps._bootstrap.curator_runtime --profile byob "
+        "-- python -m nemotron.steps.byob.mcq.step --config {config}"
+    }
+    assert SlurmBackend._uses_code_packager(_ctx(env=env))
+
+
+def test_slurm_backend_prefers_runspec_cmd_over_env_run_command() -> None:
+    """When the run spec has its own ``cmd``, it wins over ``env['run_command']``."""
+    env = {"run_command": "python -m env.wrapper --config {config}"}
+    job = _ctx(env=env, cmd="python -m spec.wrapper --config {config}")
+
+    built = SlurmBackend._build_cmd(job)
+    assert built == "python -m spec.wrapper --config config.yaml"
diff --git a/tests/steps/test_translation_cli.py b/tests/steps/test_translation_cli.py
index 56b01d9a8..55999d54c 100644
--- a/tests/steps/test_translation_cli.py
+++ b/tests/steps/test_translation_cli.py
@@ -12,86 +12,86 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for the agentic translation CLI command."""
+"""Smoke tests for `nemotron steps` CLI surface for translation.
 
-from __future__ import annotations
+The bespoke ``nemotron steps translation`` recipe command and the top-level
+``nemotron byob`` command were removed in favour of the generic
+``nemotron steps run <id>`` dispatcher. These tests guard the new contract.
+"""
 
-from pathlib import Path
-from types import SimpleNamespace
-from unittest.mock import Mock
+from __future__ import annotations
 
-import pytest
-import typer
 from typer.testing import CliRunner
 
-import nemotron.cli.commands.steps.translation as translation_module
 from nemotron.cli.bin.nemotron import app
 
 
-def _fake_cfg(**overrides):
-    defaults = {
-        "mode": "local",
-        "passthrough": [],
-        "dry_run": False,
-        "ctx": SimpleNamespace(),
-    }
-    defaults.update(overrides)
-    return SimpleNamespace(**defaults)
-
-
-def test_root_cli_registers_steps_translation_command() -> None:
+def test_steps_help_lists_catalog_commands() -> None:
     result = CliRunner().invoke(app, ["steps", "--help"])
 
     assert result.exit_code == 0
-    assert "translation" in result.output
+    assert "list" in result.output
+    assert "show" in result.output
+    assert "run" in result.output
+
+
+def test_steps_help_does_not_expose_bespoke_translation_command() -> None:
+    """`nemotron steps translation` is gone; `steps run translate/nemo_curator` replaces it."""
+    runner = CliRunner()
+
+    outcome = runner.invoke(app, ["steps", "translation", "--help"])
+    assert outcome.exit_code != 0
+
+
+def test_root_does_not_register_step_alias() -> None:
+    """The singular ``step`` spelling is not accepted as a root command."""
+    outcome = CliRunner().invoke(app, ["step", "--help"])
+    assert outcome.exit_code != 0
+    assert "No such command" in outcome.output
+
+
+def test_root_does_not_register_top_level_byob_command() -> None:
+    """`nemotron byob` is gone; `steps run byob/mcq` replaces it."""
+    runner = CliRunner()
+
+    outcome = runner.invoke(app, ["byob", "--help"])
+    assert outcome.exit_code != 0
+
 
+def test_steps_show_resolves_curator_step() -> None:
+    result = CliRunner().invoke(app, ["steps", "show", "translate/nemo_curator"])
 
-def test_translation_cli_runs_checked_in_step(monkeypatch: pytest.MonkeyPatch) -> None:
-    config = {
-        "input_path": "/data/source.jsonl",
-        "output_dir": "/data/translated",
-        "source_language": "en",
-        "target_language": "hi",
-    }
-    run_mock = Mock(return_value=Path("/data/translated"))
+    assert result.exit_code == 0, result.output
+    assert "translate/nemo_curator" in result.output
 
-    monkeypatch.setattr(translation_module, "parse_recipe_config", lambda ctx: _fake_cfg())
-    monkeypatch.setattr(translation_module, "_load_translation_config", lambda cfg: config)
-    monkeypatch.setattr(translation_module, "_run_translation_step", run_mock)
 
-    translation_module.translation(ctx=Mock())
+def test_steps_show_resolves_byob_mcq_step() -> None:
+    result = CliRunner().invoke(app, ["steps", "show", "byob/mcq"])
 
-    run_mock.assert_called_once_with(config)
+    assert result.exit_code == 0, result.output
+    assert "byob/mcq" in result.output
 
 
-def test_translation_cli_rejects_remote_mode(monkeypatch: pytest.MonkeyPatch) -> None:
-    monkeypatch.setattr(
-        translation_module,
-        "parse_recipe_config",
-        lambda ctx: _fake_cfg(mode="run"),
-    )
+def test_steps_show_rejects_legacy_translation_id() -> None:
+    """The legacy `translate/translation` id no longer resolves."""
 
-    with pytest.raises(typer.Exit) as exc_info:
-        translation_module.translation(ctx=Mock())
+    result = CliRunner().invoke(app, ["steps", "show", "translate/translation"])
 
-    assert exc_info.value.exit_code == 1
+    assert result.exit_code != 0
+    combined = result.output or ""  # output already carries stderr; ``result.stderr`` raises on Click < 8.2
+    assert "Unknown step id" in combined or "Did you mean" in combined
 
 
-def test_translation_cli_dry_run_skips_execution(monkeypatch: pytest.MonkeyPatch) -> None:
-    run_mock = Mock()
+def test_steps_show_rejects_legacy_byob_id() -> None:
+    """The legacy single-segment `byob` id no longer resolves.
 
-    monkeypatch.setattr(
-        translation_module,
-        "parse_recipe_config",
-        lambda ctx: _fake_cfg(dry_run=True),
-    )
-    monkeypatch.setattr(
-        translation_module,
-        "_load_translation_config",
-        lambda cfg: {"input_path": "/data/source.jsonl", "output_dir": "/data/translated"},
-    )
-    monkeypatch.setattr(translation_module, "_run_translation_step", run_mock)
+    The directory-tail short-form would have matched ``byob`` to a folder named
+    ``byob``, but the new layout has no such folder — the step lives at
+    ``byob/mcq``.
+    """
 
-    translation_module.translation(ctx=Mock())
+    result = CliRunner().invoke(app, ["steps", "show", "byob"])
 
-    run_mock.assert_not_called()
+    assert result.exit_code != 0
+    combined = result.output or ""  # output already carries stderr; ``result.stderr`` raises on Click < 8.2
+    assert "Unknown step id" in combined or "Did you mean" in combined
diff --git a/tests/steps/tier2/cases/4gpu-automodel-lora.toml b/tests/steps/tier2/cases/4gpu-automodel-lora.toml
index 4bedca20f..0fc051980 100644
--- a/tests/steps/tier2/cases/4gpu-automodel-lora.toml
+++ b/tests/steps/tier2/cases/4gpu-automodel-lora.toml
@@ -9,7 +9,7 @@ request = "Fine-tune an instruction model on my chat JSONL. We only have 4 GPUs,
 [expected.steps]
 must_include = ["sft/automodel", "eval/model_eval"]
 may_include = []
-must_exclude = ["prep/sft_packing", "sft/megatron_bridge", "curate/nemo_curator"]
+must_exclude = ["data_prep/sft_packing", "sft/megatron_bridge", "curate/nemo_curator"]
 
 [expected.plan]
 has_mermaid_diagram = true
@@ -27,4 +27,4 @@ no_relative_data_paths = true
 
 [expected.disclosure]
 must_read = ["STEPS.md", "sft/guide.md"]
-must_not_read = ["prep/guide.md"]
+must_not_read = ["data_prep/guide.md"]
diff --git a/tests/steps/tier2/cases/add-rl-after-sft.toml b/tests/steps/tier2/cases/add-rl-after-sft.toml
index 8824d4d2c..ad8dd0933 100644
--- a/tests/steps/tier2/cases/add-rl-after-sft.toml
+++ b/tests/steps/tier2/cases/add-rl-after-sft.toml
@@ -7,7 +7,7 @@ tags = ["iteration", "add-stage", "rl", "grpo", "multi-turn"]
 request = "Build a Nano3 SFT pipeline for my instruction JSONL on 2 nodes of 8x H100."
 
 [expected.steps]
-must_include = ["prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
 must_exclude = ["sft/automodel"]
 
@@ -25,5 +25,5 @@ no_relative_data_paths = true
 [expected.iteration]
 followup = "Add RL with GRPO after SFT."
 must_add_stage = "03_rl"
-must_have_step = "rl/nemo_rl_grpo"
+must_have_step = "rl/nemo_rl/rlvr"
 must_mention = ["GRPO", "checkpoint_megatron"]
diff --git a/tests/steps/tier2/cases/simple-sft-english.toml b/tests/steps/tier2/cases/simple-sft-english.toml
index 8d539fc3a..64b29446b 100644
--- a/tests/steps/tier2/cases/simple-sft-english.toml
+++ b/tests/steps/tier2/cases/simple-sft-english.toml
@@ -7,9 +7,9 @@ tags = ["short", "catalog", "minimality", "english", "megatron-bridge"]
 request = "Fine-tune Nemotron Nano3 on my English chat-format JSONL dataset with a single 8x H100 node."
 
 [expected.steps]
-must_include = ["prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
-must_exclude = ["curate/nemo_curator", "translate/translation", "sft/automodel"]
+must_exclude = ["curate/nemo_curator", "translate/nemo_curator", "sft/automodel"]
 
 [expected.plan]
 has_mermaid_diagram = true
@@ -26,5 +26,5 @@ no_relative_data_paths = true
 "stages/02_sft/config/default.yaml" = ["tensor_model_parallel_size: 4"]
 
 [expected.disclosure]
-must_read = ["STEPS.md", "sft/guide.md", "prep/guide.md"]
+must_read = ["STEPS.md", "sft/guide.md", "data_prep/guide.md"]
 must_not_read = []
diff --git a/tests/steps/tier2/cases/super3-multi-node.toml b/tests/steps/tier2/cases/super3-multi-node.toml
index bf31428f8..0d87b749a 100644
--- a/tests/steps/tier2/cases/super3-multi-node.toml
+++ b/tests/steps/tier2/cases/super3-multi-node.toml
@@ -7,9 +7,9 @@ tags = ["catalog", "super3", "multi-node", "parallelism", "megatron-bridge"]
 request = "Fine-tune Nemotron Super3 on 4 nodes of 8x H100 with Slurm."
 
 [expected.steps]
-must_include = ["prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
-must_exclude = ["sft/automodel", "curate/nemo_curator", "translate/translation"]
+must_exclude = ["sft/automodel", "curate/nemo_curator", "translate/nemo_curator"]
 
 [expected.plan]
 has_mermaid_diagram = true
diff --git a/tests/steps/tier2/cases/swap-megatron-to-automodel.toml b/tests/steps/tier2/cases/swap-megatron-to-automodel.toml
index 6166f0e6d..1df6bdd61 100644
--- a/tests/steps/tier2/cases/swap-megatron-to-automodel.toml
+++ b/tests/steps/tier2/cases/swap-megatron-to-automodel.toml
@@ -7,9 +7,9 @@ tags = ["iteration", "swap-step", "hardware", "automodel", "megatron-bridge"]
 request = "Build an English SFT pipeline for Nemotron-style instruction tuning on one 8x H100 node."
 
 [expected.steps]
-must_include = ["prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
-must_exclude = ["curate/nemo_curator", "translate/translation"]
+must_exclude = ["curate/nemo_curator", "translate/nemo_curator"]
 
 [expected.plan]
 has_mermaid_diagram = true
diff --git a/tests/steps/tier2/cases/thai-sft-nano3.toml b/tests/steps/tier2/cases/thai-sft-nano3.toml
index 270ae6c7d..726ff0654 100644
--- a/tests/steps/tier2/cases/thai-sft-nano3.toml
+++ b/tests/steps/tier2/cases/thai-sft-nano3.toml
@@ -7,9 +7,9 @@ tags = ["linear", "catalog", "multilingual", "multi-node", "nano3", "sovereign-a
 request = "Fine-tune Nemotron Nano3 for Thai customer support. We have 2 nodes of 8x H100 on Slurm."
 
 [expected.steps]
-must_include = ["curate/nemo_curator", "translate/translation", "prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["curate/nemo_curator", "translate/nemo_curator", "data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
-must_exclude = ["sft/automodel", "rl/nemo_rl_grpo"]
+must_exclude = ["sft/automodel", "rl/nemo_rl/rlvr"]
 
 [expected.plan]
 has_mermaid_diagram = true
diff --git a/tests/steps/tier2/cases/translation-chat-before-sft.toml b/tests/steps/tier2/cases/translation-chat-before-sft.toml
index 0cc75deb0..c23cd13b0 100644
--- a/tests/steps/tier2/cases/translation-chat-before-sft.toml
+++ b/tests/steps/tier2/cases/translation-chat-before-sft.toml
@@ -7,7 +7,7 @@ tags = ["translation", "chat", "multilingual", "sft"]
 request = "Translate a multi-turn chat corpus from English to Thai before SFT on Nemotron Nano3."
 
 [expected.steps]
-must_include = ["translate/translation", "prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
+must_include = ["translate/nemo_curator", "data_prep/sft_packing", "sft/megatron_bridge", "eval/model_eval"]
 may_include = []
 must_exclude = ["curate/nemo_curator", "sft/automodel"]
 
@@ -23,5 +23,5 @@ must_have_tiny_config = true
 no_relative_data_paths = true
 
 [expected.disclosure]
-must_read = ["STEPS.md", "PATTERNS.md", "translate/translation/step.toml"]
+must_read = ["STEPS.md", "PATTERNS.md", "translate/nemo_curator/step.toml"]
 must_not_read = []
diff --git a/tests/steps/tier2/cases/translation-faith-governance.toml b/tests/steps/tier2/cases/translation-faith-governance.toml
index 015b9d5eb..1eb537ef9 100644
--- a/tests/steps/tier2/cases/translation-faith-governance.toml
+++ b/tests/steps/tier2/cases/translation-faith-governance.toml
@@ -7,9 +7,9 @@ tags = ["translation", "faith", "governance", "chat"]
 request = "Translate this tool-calling chat dataset to German and include FAITH evaluation for governance review."
 
 [expected.steps]
-must_include = ["translate/translation"]
+must_include = ["translate/nemo_curator"]
 may_include = []
-must_exclude = ["curate/nemo_curator", "prep/sft_packing", "sft/megatron_bridge", "sft/automodel"]
+must_exclude = ["curate/nemo_curator", "data_prep/sft_packing", "sft/megatron_bridge", "sft/automodel"]
 
 [expected.plan]
 has_mermaid_diagram = true
@@ -23,5 +23,5 @@ must_have_tiny_config = false
 no_relative_data_paths = true
 
 [expected.disclosure]
-must_read = ["STEPS.md", "PATTERNS.md", "translate/translation/step.toml"]
+must_read = ["STEPS.md", "PATTERNS.md", "translate/nemo_curator/step.toml"]
 must_not_read = ["sft/guide.md"]
diff --git a/tests/steps/tier2/cases/translation-large-nmt-corpus.toml b/tests/steps/tier2/cases/translation-large-nmt-corpus.toml
index 66f18a1ba..7c75b8040 100644
--- a/tests/steps/tier2/cases/translation-large-nmt-corpus.toml
+++ b/tests/steps/tier2/cases/translation-large-nmt-corpus.toml
@@ -7,9 +7,9 @@ tags = ["translation", "nmt", "large-corpus", "throughput"]
 request = "Translate a large English news corpus to Hindi with a local NMT server."
 
 [expected.steps]
-must_include = ["translate/translation"]
+must_include = ["translate/nemo_curator"]
 may_include = []
-must_exclude = ["curate/nemo_curator", "prep/sft_packing", "sft/megatron_bridge", "sft/automodel", "eval/model_eval"]
+must_exclude = ["curate/nemo_curator", "data_prep/sft_packing", "sft/megatron_bridge", "sft/automodel", "eval/model_eval"]
 
 [expected.plan]
 has_mermaid_diagram = true
@@ -23,5 +23,5 @@ must_have_tiny_config = false
 no_relative_data_paths = true
 
 [expected.disclosure]
-must_read = ["STEPS.md", "PATTERNS.md", "translate/translation/step.toml"]
+must_read = ["STEPS.md", "PATTERNS.md", "translate/nemo_curator/step.toml"]
 must_not_read = ["sft/guide.md"]
diff --git a/tests/steps/tier2/cases/type-mismatch-no-prep.toml b/tests/steps/tier2/cases/type-mismatch-no-prep.toml
index 738a807da..f94ec852b 100644
--- a/tests/steps/tier2/cases/type-mismatch-no-prep.toml
+++ b/tests/steps/tier2/cases/type-mismatch-no-prep.toml
@@ -1,13 +1,13 @@
 [case]
 name = "type-mismatch-no-prep"
-description = "The agent should catch direct JSONL -> Megatron-Bridge incompatibility and insert prep/sft_packing."
-tags = ["verify", "type-mismatch", "prep", "megatron-bridge"]
+description = "The agent should catch direct JSONL -> Megatron-Bridge incompatibility and insert data_prep/sft_packing."
+tags = ["verify", "type-mismatch", "data_prep", "megatron-bridge"]
 
 [input]
 request = "Take my chat-format JSONL and feed it directly into Megatron-Bridge SFT with no packing step."
 
 [expected.steps]
-must_include = ["prep/sft_packing", "sft/megatron_bridge"]
+must_include = ["data_prep/sft_packing", "sft/megatron_bridge"]
 may_include = ["eval/model_eval"]
 must_exclude = ["sft/automodel"]
 
@@ -23,5 +23,5 @@ must_have_tiny_config = true
 no_relative_data_paths = true
 
 [expected.disclosure]
-must_read = ["STEPS.md", "prep/guide.md", "sft/guide.md"]
+must_read = ["STEPS.md", "data_prep/guide.md", "sft/guide.md"]
 must_not_read = []
diff --git a/tests/steps/tier2/cases/unsupported-model.toml b/tests/steps/tier2/cases/unsupported-model.toml
index 04d6ded2f..f4ccf18a3 100644
--- a/tests/steps/tier2/cases/unsupported-model.toml
+++ b/tests/steps/tier2/cases/unsupported-model.toml
@@ -9,7 +9,7 @@ request = "Fine-tune Falcon-Mamba-999B with a pipeline built from the current Ne
 [expected.steps]
 must_include = []
 may_include = []
-must_exclude = ["prep/sft_packing", "sft/megatron_bridge", "sft/automodel", "eval/model_eval"]
+must_exclude = ["data_prep/sft_packing", "sft/megatron_bridge", "sft/automodel", "eval/model_eval"]
 
 [expected.plan]
 has_mermaid_diagram = false
diff --git a/tests/steps/tier2/plan_graph_checker.py b/tests/steps/tier2/plan_graph_checker.py
index c359af7e1..ccf165d50 100644
--- a/tests/steps/tier2/plan_graph_checker.py
+++ b/tests/steps/tier2/plan_graph_checker.py
@@ -11,7 +11,7 @@
 
 DEFAULT_STEPS_ROOT = Path(__file__).resolve().parents[3] / "src" / "nemotron" / "steps"
 STEP_ID_PATTERN = re.compile(
-    r"(benchmark|byob|convert|curate|eval|prep|pretrain|rl|sft|sdg|synth|translate)/[a-z0-9_]+"
+    r"(benchmark|byob|convert|curate|data_prep|eval|prep|pretrain|rl|sft|sdg|translate)/[a-z0-9_]+"
 )
 
 
diff --git a/use-case-examples/README.md b/use-case-examples/README.md
index d35595382..84c1572a5 100644
--- a/use-case-examples/README.md
+++ b/use-case-examples/README.md
@@ -16,4 +16,4 @@ These examples showcase **complete implementations** of agentic workflows, RAG s
 |---------|-------------|
 | [RAG Agent with Nemotron RAG Models](RAG Agent with Nemotron RAG Models/README.md) | End-to-end example of a Retrieval-Augmented Generation (RAG) agent workflow using Nemotron RAG models through Hugging Face and Nemotron 9B hosted through build.nvidia.com models |
 | [Data Science ML Agent](Data Science ML Agent/README.md) | End-to-end example of a natural language-driven data science and machine learning agent powered by NVIDIA GPUs. The agent allows users to perform data exploration, model training, and hyperparameter optimization interactively using RAPIDS cuDF and cuML for GPU acceleration.|
-| [Build Your Own Benchmark](build-your-own-benchmark/README.md) | End-to-end BYOB MCQ benchmark example using the `src/nemotron/steps/byob` step, packaged step configs, and `nemotron byob` CLI. |
+| [Build Your Own Benchmark](build-your-own-benchmark/README.md) | End-to-end BYOB MCQ benchmark example using the `src/nemotron/steps/byob/mcq` step, packaged step configs, and the `nemotron steps run byob/mcq` CLI. |
diff --git a/use-case-examples/build-your-own-benchmark/README.md b/use-case-examples/build-your-own-benchmark/README.md
index 6547fa15a..a7af341f8 100644
--- a/use-case-examples/build-your-own-benchmark/README.md
+++ b/use-case-examples/build-your-own-benchmark/README.md
@@ -5,8 +5,8 @@ This example shows the current Nemotron BYOB flow for creating a domain-specific
 The notebook is intentionally aligned with the agentic step structure:
 
 - The reusable step lives in `src/nemotron/steps/byob/`.
-- Starter configs live in `src/nemotron/steps/byob/config/`.
-- Runtime execution goes through `nemotron byob`.
+- Starter configs live in `src/nemotron/steps/byob/mcq/config/`.
+- Runtime execution goes through `nemotron steps run byob/mcq`.
 - The notebook creates only example-local inputs, outputs, and a working config.
 
 ## Files
@@ -31,13 +31,19 @@ The notebook defaults to a dry command preview so it does not accidentally spend
 The equivalent CLI is:
 
 ```bash
-uv run nemotron byob --list-families
-uv run nemotron byob --family mcq --stage prepare --config use-case-examples/build-your-own-benchmark/config/finance_wiki.yaml
-uv run nemotron byob --family mcq --stage generate --config use-case-examples/build-your-own-benchmark/config/finance_wiki.yaml
+uv run nemotron steps show byob/mcq                          # see family.choices, parameters
+uv run nemotron steps run byob/mcq \
+  -c use-case-examples/build-your-own-benchmark/config/finance_wiki.yaml \
+  stage=prepare family=mcq
+uv run nemotron steps run byob/mcq \
+  -c use-case-examples/build-your-own-benchmark/config/finance_wiki.yaml \
+  stage=generate family=mcq
 ```
 
 Optional translation uses:
 
 ```bash
-uv run nemotron byob --family mcq --stage translate --config use-case-examples/build-your-own-benchmark/config/finance_wiki_translate.yaml
+uv run nemotron steps run byob/mcq \
+  -c use-case-examples/build-your-own-benchmark/config/finance_wiki_translate.yaml \
+  stage=translate family=mcq
 ```
diff --git a/use-case-examples/build-your-own-benchmark/build_mcq_benchmark.ipynb b/use-case-examples/build-your-own-benchmark/build_mcq_benchmark.ipynb
index f500070d3..b259c3c72 100644
--- a/use-case-examples/build-your-own-benchmark/build_mcq_benchmark.ipynb
+++ b/use-case-examples/build-your-own-benchmark/build_mcq_benchmark.ipynb
@@ -6,10 +6,11 @@
       "source": [
         "# Build Your Own Benchmark: finance MCQ example\n",
         "\n",
-        "This notebook demonstrates the current Nemotron BYOB step structure. It does not import MCQ pipeline internals directly. Instead, it prepares an example corpus, writes a working config from `src/nemotron/steps/byob/config/default.yaml`, and runs the public CLI surface: `nemotron byob`.\n",
+        "This notebook demonstrates the current Nemotron BYOB step structure. It does not import MCQ pipeline internals directly. Instead, it prepares an example corpus, writes a working config from `src/nemotron/steps/byob/mcq/config/default.yaml`, and runs the public CLI surface: `nemotron steps run byob/mcq`.\n",
         "\n",
         "The generated benchmark keeps the MMLU-Pro-style BYOB schema documented in `src/nemotron/steps/byob/references/benchmark-schema.md`."
-      ]
+      ],
+      "id": "01589e19"
     },
     {
       "cell_type": "markdown",
@@ -35,9 +36,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "from __future__ import annotations\n",
         "\n",
@@ -59,7 +58,7 @@
         "REPO_ROOT = find_repo_root(Path.cwd().resolve())\n",
         "SRC_DIR = REPO_ROOT / \"src\"\n",
         "EXAMPLE_DIR = REPO_ROOT / \"use-case-examples\" / \"build-your-own-benchmark\"\n",
-        "STEP_CONFIG_DIR = SRC_DIR / \"nemotron\" / \"steps\" / \"byob\" / \"config\"\n",
+        "STEP_CONFIG_DIR = SRC_DIR / \"nemotron\" / \"steps\" / \"byob\" / \"mcq\" / \"config\"\n",
         "STEP_DEFAULT_CONFIG = STEP_CONFIG_DIR / \"default.yaml\"\n",
         "STEP_TRANSLATE_CONFIG = STEP_CONFIG_DIR / \"translate.yaml\"\n",
         "\n",
@@ -79,7 +78,9 @@
         "print(\"Packaged BYOB config:\", STEP_DEFAULT_CONFIG)\n",
         "print(\"Example corpus dir:\", ASSETS_DIR)\n",
         "print(\"Working config:\", WORKING_CONFIG)"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -92,9 +93,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "NGC_API_KEY = None\n",
         "NVIDIA_API_KEY = None\n",
@@ -115,7 +114,9 @@
         "print(\"NGC_API_KEY set:\", bool(os.environ.get(\"NGC_API_KEY\")))\n",
         "print(\"NVIDIA_API_KEY set:\", bool(os.environ.get(\"NVIDIA_API_KEY\")))\n",
         "print(\"HF_TOKEN set:\", bool(os.environ.get(\"HF_TOKEN\") or os.environ.get(\"HUGGING_FACE_HUB_TOKEN\")))"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -128,9 +129,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import json\n",
         "import re\n",
@@ -188,7 +187,9 @@
         "\n",
         "for path in sorted(ASSETS_DIR.glob(\"*.txt\")):\n",
         "    print(path.name, path.stat().st_size, \"bytes\")"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -201,9 +202,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "cfg = yaml.safe_load(STEP_DEFAULT_CONFIG.read_text(encoding=\"utf-8\"))\n",
         "\n",
@@ -235,7 +234,9 @@
         "\n",
         "WORKING_CONFIG.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding=\"utf-8\")\n",
         "print(WORKING_CONFIG.read_text(encoding=\"utf-8\"))"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -246,21 +247,29 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "def run_nemotron_byob(*args: str, check: bool = True) -> subprocess.CompletedProcess[str]:\n",
+        "    \"\"\"Invoke `nemotron steps run byob/mcq` with the given trailing arguments.\"\"\"\n",
+        "\n",
         "    env = os.environ.copy()\n",
         "    existing_pythonpath = env.get(\"PYTHONPATH\", \"\")\n",
         "    env[\"PYTHONPATH\"] = str(SRC_DIR) + (os.pathsep + existing_pythonpath if existing_pythonpath else \"\")\n",
-        "    cmd = [sys.executable, \"-m\", \"nemotron\", \"byob\", *map(str, args)]\n",
+        "    cmd = [sys.executable, \"-m\", \"nemotron\", \"steps\", \"run\", \"byob/mcq\", *map(str, args)]\n",
         "    print(\"$\", \" \".join(cmd))\n",
         "    return subprocess.run(cmd, cwd=REPO_ROOT, env=env, text=True, check=check)\n",
         "\n",
         "\n",
-        "run_nemotron_byob(\"--list-families\")"
-      ]
+        "# `steps show <id>` is the discovery surface for parameters and family choices.\n",
+        "subprocess.run(\n",
+        "    [sys.executable, \"-m\", \"nemotron\", \"steps\", \"show\", \"byob/mcq\"],\n",
+        "    cwd=REPO_ROOT,\n",
+        "    text=True,\n",
+        "    check=False,\n",
+        ")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -271,14 +280,12 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "RUN_BYOB = False\n",
         "\n",
-        "prepare_args = [\"--family\", \"mcq\", \"--stage\", \"prepare\", \"--config\", str(WORKING_CONFIG)]\n",
-        "generate_args = [\"--family\", \"mcq\", \"--stage\", \"generate\", \"--config\", str(WORKING_CONFIG)]\n",
+        "prepare_args = [\"-c\", str(WORKING_CONFIG), \"stage=prepare\", \"family=mcq\"]\n",
+        "generate_args = [\"-c\", str(WORKING_CONFIG), \"stage=generate\", \"family=mcq\"]\n",
         "\n",
         "if RUN_BYOB:\n",
         "    if not (os.environ.get(\"NGC_API_KEY\") or os.environ.get(\"NVIDIA_API_KEY\")):\n",
@@ -287,9 +294,11 @@
         "    run_nemotron_byob(*generate_args)\n",
         "else:\n",
         "    print(\"Preview only. Commands to run:\")\n",
-        "    print(\"python -m nemotron byob\", \" \".join(prepare_args))\n",
-        "    print(\"python -m nemotron byob\", \" \".join(generate_args))"
-      ]
+        "    print(\"python -m nemotron steps run byob/mcq\", \" \".join(prepare_args))\n",
+        "    print(\"python -m nemotron steps run byob/mcq\", \" \".join(generate_args))"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -300,9 +309,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import pandas as pd\n",
         "from IPython.display import display\n",
@@ -320,7 +327,9 @@
         "    display(df.head())\n",
         "else:\n",
         "    print(\"Run the BYOB prepare/generate cell first, or point final_benchmark at an existing output.\")"
-      ]
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
@@ -333,9 +342,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "translate_cfg = yaml.safe_load(STEP_TRANSLATE_CONFIG.read_text(encoding=\"utf-8\"))\n",
         "translate_cfg.update(\n",
@@ -351,7 +358,7 @@
         "print(TRANSLATION_CONFIG.read_text(encoding=\"utf-8\"))\n",
         "\n",
         "RUN_TRANSLATION = False\n",
-        "translation_args = [\"--family\", \"mcq\", \"--stage\", \"translate\", \"--config\", str(TRANSLATION_CONFIG)]\n",
+        "translation_args = [\"-c\", str(TRANSLATION_CONFIG), \"stage=translate\", \"family=mcq\"]\n",
         "\n",
         "if RUN_TRANSLATION:\n",
         "    if not final_benchmark.exists():\n",
@@ -361,8 +368,11 @@
         "    run_nemotron_byob(*translation_args)\n",
         "else:\n",
         "    print(\"Preview only. Command to run:\")\n",
-        "    print(\"python -m nemotron byob\", \" \".join(translation_args))"
-      ]
+        "    print(\"python -m nemotron steps run byob/mcq\", \" \".join(translation_args))"
+      ],
+      "execution_count": null,
+      "outputs": [],
+      "id": "9c7bd4f7"
     }
   ],
   "metadata": {
@@ -385,4 +395,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/uv.lock b/uv.lock
index edf44d045..473ce254e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,35 +1,32 @@
 version = 1
 revision = 3
-requires-python = ">=3.10"
+requires-python = ">=3.10, <3.14"
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
     "python_full_version < '3.11' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
 
 [manifest]
-constraints = [{ name = "transformers", specifier = ">=4.56.0,<5.0" }]
+constraints = [
+    { name = "cryptography", specifier = ">=48.0.0" },
+    { name = "gitpython", specifier = ">=3.1.50" },
+    { name = "pytest", specifier = ">=9.0.3" },
+    { name = "python-multipart", specifier = ">=0.0.29" },
+    { name = "transformers", specifier = ">=4.57.6,<5.0" },
+]
 overrides = [
     { name = "huggingface-hub", specifier = ">=0.34,<1.0" },
     { name = "torch", specifier = "==2.10.0" },
+    { name = "urllib3", specifier = ">=2.7.0,<3" },
 ]
 
 [[package]]
@@ -165,40 +162,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356, upload-time = "2026-03-31T21:58:44.049Z" },
     { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637, upload-time = "2026-03-31T21:58:46.167Z" },
     { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896, upload-time = "2026-03-31T21:58:48.119Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721, upload-time = "2026-03-31T21:58:50.229Z" },
-    { url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663, upload-time = "2026-03-31T21:58:52.232Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094, upload-time = "2026-03-31T21:58:54.566Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701, upload-time = "2026-03-31T21:58:56.864Z" },
-    { url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360, upload-time = "2026-03-31T21:58:59.072Z" },
-    { url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023, upload-time = "2026-03-31T21:59:01.776Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795, upload-time = "2026-03-31T21:59:04.568Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405, upload-time = "2026-03-31T21:59:07.221Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082, upload-time = "2026-03-31T21:59:09.484Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346, upload-time = "2026-03-31T21:59:12.068Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891, upload-time = "2026-03-31T21:59:14.552Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113, upload-time = "2026-03-31T21:59:17.068Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088, upload-time = "2026-03-31T21:59:19.541Z" },
-    { url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976, upload-time = "2026-03-31T21:59:22.311Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444, upload-time = "2026-03-31T21:59:24.635Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128, upload-time = "2026-03-31T21:59:27.291Z" },
-    { url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029, upload-time = "2026-03-31T21:59:29.429Z" },
-    { url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758, upload-time = "2026-03-31T21:59:31.547Z" },
-    { url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883, upload-time = "2026-03-31T21:59:34.098Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668, upload-time = "2026-03-31T21:59:36.497Z" },
-    { url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461, upload-time = "2026-03-31T21:59:38.723Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661, upload-time = "2026-03-31T21:59:41.187Z" },
-    { url = "https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800, upload-time = "2026-03-31T21:59:43.84Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382, upload-time = "2026-03-31T21:59:46.187Z" },
-    { url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724, upload-time = "2026-03-31T21:59:48.656Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027, upload-time = "2026-03-31T21:59:51.284Z" },
-    { url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644, upload-time = "2026-03-31T21:59:53.753Z" },
-    { url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630, upload-time = "2026-03-31T21:59:56.239Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403, upload-time = "2026-03-31T21:59:59.103Z" },
-    { url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924, upload-time = "2026-03-31T22:00:02.116Z" },
-    { url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119, upload-time = "2026-03-31T22:00:04.756Z" },
-    { url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072, upload-time = "2026-03-31T22:00:07.494Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819, upload-time = "2026-03-31T22:00:10.277Z" },
-    { url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441, upload-time = "2026-03-31T22:00:12.791Z" },
 ]
 
 [[package]]
@@ -292,6 +255,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a1/ee/48ca1a7c89ffec8b6a0c5d02b89c305671d5ffd8d3c94acf8b8c408575bb/anyio-4.9.0-py3-none-any.whl", hash = "sha256:9f76d541cad6e36af7beb62e978876f3b41e3e04f2c1fbf0884604c0a9c4d93c", size = 100916, upload-time = "2025-03-17T00:02:52.713Z" },
 ]
 
+[[package]]
+name = "argcomplete"
+version = "3.6.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/38/61/0b9ae6399dd4a58d8c1b1dc5a27d6f2808023d0b5dd3104bb99f45a33ff6/argcomplete-3.6.3.tar.gz", hash = "sha256:62e8ed4fd6a45864acc8235409461b72c9a28ee785a2011cc5eb78318786c89c", size = 73754, upload-time = "2025-10-20T03:33:34.741Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/74/f5/9373290775639cb67a2fce7f629a1c240dce9f12fe927bc32b2736e16dfc/argcomplete-3.6.3-py3-none-any.whl", hash = "sha256:f5007b3a600ccac5d25bbce33089211dfd49eab4a7718da3f10e3082525a92ce", size = 43846, upload-time = "2025-10-20T03:33:33.021Z" },
+]
+
 [[package]]
 name = "astroid"
 version = "3.3.11"
@@ -361,19 +333,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/10/a6/ffb49d4254ed085e62e3e5dd05982b4393e32fe1e49bb1130186617c29cd/bcrypt-5.0.0-cp313-cp313t-win32.whl", hash = "sha256:9d52ed507c2488eddd6a95bccee4e808d3234fa78dd370e24bac65a21212b861", size = 148498, upload-time = "2025-09-25T19:49:24.134Z" },
     { url = "https://files.pythonhosted.org/packages/48/a9/259559edc85258b6d5fc5471a62a3299a6aa37a6611a169756bf4689323c/bcrypt-5.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f6984a24db30548fd39a44360532898c33528b74aedf81c26cf29c51ee47057e", size = 145853, upload-time = "2025-09-25T19:49:25.702Z" },
     { url = "https://files.pythonhosted.org/packages/2d/df/9714173403c7e8b245acf8e4be8876aac64a209d1b392af457c79e60492e/bcrypt-5.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:9fffdb387abe6aa775af36ef16f55e318dcda4194ddbf82007a6f21da29de8f5", size = 139626, upload-time = "2025-09-25T19:49:26.928Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/14/c18006f91816606a4abe294ccc5d1e6f0e42304df5a33710e9e8e95416e1/bcrypt-5.0.0-cp314-cp314t-macosx_10_12_universal2.whl", hash = "sha256:4870a52610537037adb382444fefd3706d96d663ac44cbb2f37e3919dca3d7ef", size = 481862, upload-time = "2025-09-25T19:49:28.365Z" },
-    { url = "https://files.pythonhosted.org/packages/67/49/dd074d831f00e589537e07a0725cf0e220d1f0d5d8e85ad5bbff251c45aa/bcrypt-5.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:48f753100931605686f74e27a7b49238122aa761a9aefe9373265b8b7aa43ea4", size = 268544, upload-time = "2025-09-25T19:49:30.39Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/91/50ccba088b8c474545b034a1424d05195d9fcbaaf802ab8bfe2be5a4e0d7/bcrypt-5.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f70aadb7a809305226daedf75d90379c397b094755a710d7014b8b117df1ebbf", size = 271787, upload-time = "2025-09-25T19:49:32.144Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/e7/d7dba133e02abcda3b52087a7eea8c0d4f64d3e593b4fffc10c31b7061f3/bcrypt-5.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:744d3c6b164caa658adcb72cb8cc9ad9b4b75c7db507ab4bc2480474a51989da", size = 269753, upload-time = "2025-09-25T19:49:33.885Z" },
-    { url = "https://files.pythonhosted.org/packages/33/fc/5b145673c4b8d01018307b5c2c1fc87a6f5a436f0ad56607aee389de8ee3/bcrypt-5.0.0-cp314-cp314t-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a28bc05039bdf3289d757f49d616ab3efe8cf40d8e8001ccdd621cd4f98f4fc9", size = 289587, upload-time = "2025-09-25T19:49:35.144Z" },
-    { url = "https://files.pythonhosted.org/packages/27/d7/1ff22703ec6d4f90e62f1a5654b8867ef96bafb8e8102c2288333e1a6ca6/bcrypt-5.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7f277a4b3390ab4bebe597800a90da0edae882c6196d3038a73adf446c4f969f", size = 272178, upload-time = "2025-09-25T19:49:36.793Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/88/815b6d558a1e4d40ece04a2f84865b0fef233513bd85fd0e40c294272d62/bcrypt-5.0.0-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:79cfa161eda8d2ddf29acad370356b47f02387153b11d46042e93a0a95127493", size = 269295, upload-time = "2025-09-25T19:49:38.164Z" },
-    { url = "https://files.pythonhosted.org/packages/51/8c/e0db387c79ab4931fc89827d37608c31cc57b6edc08ccd2386139028dc0d/bcrypt-5.0.0-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:a5393eae5722bcef046a990b84dff02b954904c36a194f6cfc817d7dca6c6f0b", size = 271700, upload-time = "2025-09-25T19:49:39.917Z" },
-    { url = "https://files.pythonhosted.org/packages/06/83/1570edddd150f572dbe9fc00f6203a89fc7d4226821f67328a85c330f239/bcrypt-5.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7f4c94dec1b5ab5d522750cb059bb9409ea8872d4494fd152b53cca99f1ddd8c", size = 334034, upload-time = "2025-09-25T19:49:41.227Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/f2/ea64e51a65e56ae7a8a4ec236c2bfbdd4b23008abd50ac33fbb2d1d15424/bcrypt-5.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0cae4cb350934dfd74c020525eeae0a5f79257e8a201c0c176f4b84fdbf2a4b4", size = 352766, upload-time = "2025-09-25T19:49:43.08Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/d4/1a388d21ee66876f27d1a1f41287897d0c0f1712ef97d395d708ba93004c/bcrypt-5.0.0-cp314-cp314t-win32.whl", hash = "sha256:b17366316c654e1ad0306a6858e189fc835eca39f7eb2cafd6aaca8ce0c40a2e", size = 152449, upload-time = "2025-09-25T19:49:44.971Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/61/3291c2243ae0229e5bca5d19f4032cecad5dfb05a2557169d3a69dc0ba91/bcrypt-5.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:92864f54fb48b4c718fc92a32825d0e42265a627f956bc0361fe869f1adc3e7d", size = 149310, upload-time = "2025-09-25T19:49:46.162Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/89/4b01c52ae0c1a681d4021e5dd3e45b111a8fb47254a274fa9a378d8d834b/bcrypt-5.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dd19cf5184a90c873009244586396a6a884d591a5323f0e8a5922560718d4993", size = 143761, upload-time = "2025-09-25T19:49:47.345Z" },
     { url = "https://files.pythonhosted.org/packages/84/29/6237f151fbfe295fe3e074ecc6d44228faa1e842a81f6d34a02937ee1736/bcrypt-5.0.0-cp38-abi3-macosx_10_12_universal2.whl", hash = "sha256:fc746432b951e92b58317af8e0ca746efe93e66555f1b40888865ef5bf56446b", size = 494553, upload-time = "2025-09-25T19:49:49.006Z" },
     { url = "https://files.pythonhosted.org/packages/45/b6/4c1205dde5e464ea3bd88e8742e19f899c16fa8916fb8510a851fae985b5/bcrypt-5.0.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c2388ca94ffee269b6038d48747f4ce8df0ffbea43f31abfa18ac72f0218effb", size = 275009, upload-time = "2025-09-25T19:49:50.581Z" },
     { url = "https://files.pythonhosted.org/packages/3b/71/427945e6ead72ccffe77894b2655b695ccf14ae1866cd977e185d606dd2f/bcrypt-5.0.0-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:560ddb6ec730386e7b3b26b8b4c88197aaed924430e7b74666a586ac997249ef", size = 278029, upload-time = "2025-09-25T19:49:52.533Z" },
@@ -423,6 +382,69 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
 ]
 
+[[package]]
+name = "blinker"
+version = "1.9.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" },
+]
+
+[[package]]
+name = "blis"
+version = "1.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/d0/d0/d8cc8c9a4488a787e7fa430f6055e5bd1ddb22c340a751d9e901b82e2efe/blis-1.3.3.tar.gz", hash = "sha256:034d4560ff3cc43e8aa37e188451b0440e3261d989bb8a42ceee865607715ecd", size = 2644873, upload-time = "2025-11-17T12:28:30.511Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/db/d80daf6c060618c72acecf026410b806f620cdea62b2e72f3235d7389d05/blis-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:650f1d2b28e3c875927c63deebda463a6f9d237dff30e445bfe2127718c1a344", size = 6925724, upload-time = "2025-11-17T12:27:14.23Z" },
+    { url = "https://files.pythonhosted.org/packages/06/cd/7ac854c92e33cfccc0eded48e979a9fc26a447952d07a9c7c7da7c1d6eec/blis-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9b0d42420ddd543eec51ccb99d38364a0c0833b6895eced37127822de6ecacff", size = 1233606, upload-time = "2025-11-17T12:27:16.107Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/ae/ad3165fdbc4ef6afef585686a778c72cd67fb5aa16ab2fd2f4494186705e/blis-1.3.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f0628a030d44aa71cac5973e40c9e95ec767abaaf2fd366a094b9398885f82f2", size = 2769094, upload-time = "2025-11-17T12:27:17.883Z" },
+    { url = "https://files.pythonhosted.org/packages/25/d4/7b0820f139b4ea67606d01b59ba6afbee4552ce7b2fd179f2fb7908e294f/blis-1.3.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0114cf2d8f19e0ed210f9ae92594cd0a12efa1bbbce444028b0fc365bbbb8af", size = 11300520, upload-time = "2025-11-17T12:27:20.058Z" },
+    { url = "https://files.pythonhosted.org/packages/85/f3/865a4322bdbeb944744c1908e67fdabecd476613a17204956cff12d568c9/blis-1.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7e88181e9dd8430029ebaf22d41bf79e756e8c95363e9471717102c66beb4a6d", size = 2962083, upload-time = "2025-11-17T12:27:22.098Z" },
+    { url = "https://files.pythonhosted.org/packages/65/a2/c2842fa1e2e6bd56eb93e41b34859a9af8b5b63669ee0442bea585d8f607/blis-1.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:62fb8c731347b0f98f5f81d19d339049e61489798738467d156c66cc329b0754", size = 14177001, upload-time = "2025-11-17T12:27:24.345Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/9b/3b1532f23db8bdddf3a976e9acf51e8debd94c63be5dafb8ccbab3e62935/blis-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:631836d4f335e62c30aa50a1aa0170773265c73654d296361f95180006e88c04", size = 6184429, upload-time = "2025-11-17T12:27:27.054Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/0a/a4c8736bc497d386b0ffc76d321f478c03f1a4725e52092f93b38beb3786/blis-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e10c8d3e892b1dbdff365b9d00e08291876fc336915bf1a5e9f188ed087e1a91", size = 6925522, upload-time = "2025-11-17T12:27:29.199Z" },
+    { url = "https://files.pythonhosted.org/packages/83/5a/3437009282f23684ecd3963a8b034f9307cdd2bf4484972e5a6b096bf9ac/blis-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:66e6249564f1db22e8af1e0513ff64134041fa7e03c8dd73df74db3f4d8415a7", size = 1232787, upload-time = "2025-11-17T12:27:30.996Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/0e/82221910d16259ce3017c1442c468a3f206a4143a96fbba9f5b5b81d62e8/blis-1.3.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7260da065958b4e5475f62f44895ef9d673b0f47dcf61b672b22b7dae1a18505", size = 2844596, upload-time = "2025-11-17T12:27:32.601Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/93/ab547f1a5c23e20bca16fbcf04021c32aac3f969be737ea4980509a7ca90/blis-1.3.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9327a6ca67de8ae76fe071e8584cc7f3b2e8bfadece4961d40f2826e1cda2df", size = 11377746, upload-time = "2025-11-17T12:27:35.342Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/a6/7733820aa62da32526287a63cd85c103b2b323b186c8ee43b7772ff7017c/blis-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c4ae70629cf302035d268858a10ca4eb6242a01b2dc8d64422f8e6dcb8a8ee74", size = 3041954, upload-time = "2025-11-17T12:27:37.479Z" },
+    { url = "https://files.pythonhosted.org/packages/87/53/e39d67fd3296b649772780ca6aab081412838ecb54e0b0c6432d01626a50/blis-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:45866a9027d43b93e8b59980a23c5d7358b6536fc04606286e39fdcfce1101c2", size = 14251222, upload-time = "2025-11-17T12:27:39.705Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/44/b749f8777b020b420bceaaf60f66432fc30cc904ca5b69640ec9cbef11ed/blis-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:27f82b8633030f8d095d2b412dffa7eb6dbc8ee43813139909a20012e54422ea", size = 6171233, upload-time = "2025-11-17T12:27:41.921Z" },
+    { url = "https://files.pythonhosted.org/packages/16/d1/429cf0cf693d4c7dc2efed969bd474e315aab636e4a95f66c4ed7264912d/blis-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a1c74e100665f8e918ebdbae2794576adf1f691680b5cdb8b29578432f623ef", size = 6929663, upload-time = "2025-11-17T12:27:44.482Z" },
+    { url = "https://files.pythonhosted.org/packages/11/69/363c8df8d98b3cc97be19aad6aabb2c9c53f372490d79316bdee92d476e7/blis-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3f6c595185176ce021316263e1a1d636a3425b6c48366c1fd712d08d0b71849a", size = 1230939, upload-time = "2025-11-17T12:27:46.19Z" },
+    { url = "https://files.pythonhosted.org/packages/96/2a/fbf65d906d823d839076c5150a6f8eb5ecbc5f9135e0b6510609bda1e6b7/blis-1.3.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d734b19fba0be7944f272dfa7b443b37c61f9476d9ab054a9ac53555ceadd2e0", size = 2818835, upload-time = "2025-11-17T12:27:48.167Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/ad/58deaa3ad856dd3cc96493e40ffd2ed043d18d4d304f85a65cde1ccbf644/blis-1.3.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ef6d6e2b599a3a2788eb6d9b443533961265aa4ec49d574ed4bb846e548dcdb", size = 11366550, upload-time = "2025-11-17T12:27:49.958Z" },
+    { url = "https://files.pythonhosted.org/packages/78/82/816a7adfe1f7acc8151f01ec86ef64467a3c833932d8f19f8e06613b8a4e/blis-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8c888438ae99c500422d50698e3028b65caa8ebb44e24204d87fda2df64058f7", size = 3023686, upload-time = "2025-11-17T12:27:52.062Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/e2/0e93b865f648b5519360846669a35f28ee8f4e1d93d054f6850d8afbabde/blis-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8177879fd3590b5eecdd377f9deafb5dc8af6d684f065bd01553302fb3fcf9a7", size = 14250939, upload-time = "2025-11-17T12:27:53.847Z" },
+    { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/f7/d26e62d9be3d70473a63e0a5d30bae49c2fe138bebac224adddcdef8a7ce/blis-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1e647341f958421a86b028a2efe16ce19c67dba2a05f79e8f7e80b1ff45328aa", size = 6928322, upload-time = "2025-11-17T12:27:57.965Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/78/750d12da388f714958eb2f2fd177652323bbe7ec528365c37129edd6eb84/blis-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d563160f874abb78a57e346f07312c5323f7ad67b6370052b6b17087ef234a8e", size = 1229635, upload-time = "2025-11-17T12:28:00.118Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/36/eac4199c5b200a5f3e93cad197da8d26d909f218eb444c4f552647c95240/blis-1.3.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:30b8a5b90cb6cb81d1ada9ae05aa55fb8e70d9a0ae9db40d2401bb9c1c8f14c4", size = 2815650, upload-time = "2025-11-17T12:28:02.544Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/51/472e7b36a6bedb5242a9757e7486f702c3619eff76e256735d0c8b1679c6/blis-1.3.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9f5c53b277f6ac5b3ca30bc12ebab7ea16c8f8c36b14428abb56924213dc127", size = 11359008, upload-time = "2025-11-17T12:28:04.589Z" },
+    { url = "https://files.pythonhosted.org/packages/84/da/d0dfb6d6e6321ae44df0321384c32c322bd07b15740d7422727a1a49fc5d/blis-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6297e7616c158b305c9a8a4e47ca5fc9b0785194dd96c903b1a1591a7ca21ddf", size = 3011959, upload-time = "2025-11-17T12:28:06.862Z" },
+    { url = "https://files.pythonhosted.org/packages/20/c5/2b0b5e556fa0364ed671051ea078a6d6d7b979b1cfef78d64ad3ca5f0c7f/blis-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3f966ca74f89f8a33e568b9a1d71992fc9a0d29a423e047f0a212643e21b5458", size = 14232456, upload-time = "2025-11-17T12:28:08.779Z" },
+    { url = "https://files.pythonhosted.org/packages/31/07/4cdc81a47bf862c0b06d91f1bc6782064e8b69ac9b5d4ff51d97e4ff03da/blis-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:7a0fc4b237a3a453bdc3c7ab48d91439fcd2d013b665c46948d9eaf9c3e45a97", size = 6192624, upload-time = "2025-11-17T12:28:14.197Z" },
+]
+
+[[package]]
+name = "boto3"
+version = "1.41.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore", marker = "python_full_version >= '3.11'" },
+    { name = "jmespath", marker = "python_full_version >= '3.11'" },
+    { name = "s3transfer", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/5b/81/450cd4143864959264a3d80f9246175a20de8c1e50ec889c710eaa28cdd9/boto3-1.41.5.tar.gz", hash = "sha256:bc7806bee681dfdff2fe2b74967b107a56274f1e66ebe4d20dc8eee1ea408d17", size = 111594, upload-time = "2025-11-26T20:27:47.021Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3c/56/f47a80254ed4991cce9a2f6d8ae8aafbc8df1c3270e966b2927289e5a12f/boto3-1.41.5-py3-none-any.whl", hash = "sha256:bb278111bfb4c33dca8342bda49c9db7685e43debbfa00cc2a5eb854dd54b745", size = 139344, upload-time = "2025-11-26T20:27:45.571Z" },
+]
+
 [[package]]
 name = "botocore"
 version = "1.41.5"
@@ -545,28 +567,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" },
     { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" },
     { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" },
-    { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" },
-    { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" },
-    { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, upload-time = "2025-09-08T23:23:44.61Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" },
-    { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" },
-    { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" },
 ]
 
 [[package]]
@@ -648,22 +648,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/89/66/c7a9e1b7429be72123441bfdbaf2bc13faab3f90b933f664db506dea5915/charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac", size = 99404, upload-time = "2025-10-14T04:41:29.95Z" },
     { url = "https://files.pythonhosted.org/packages/c4/26/b9924fa27db384bdcd97ab83b4f0a8058d96ad9626ead570674d5e737d90/charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14", size = 107092, upload-time = "2025-10-14T04:41:31.188Z" },
     { url = "https://files.pythonhosted.org/packages/af/8f/3ed4bfa0c0c72a7ca17f0380cd9e4dd842b09f664e780c13cff1dcf2ef1b/charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2", size = 100408, upload-time = "2025-10-14T04:41:32.624Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/35/7051599bd493e62411d6ede36fd5af83a38f37c4767b92884df7301db25d/charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd", size = 207746, upload-time = "2025-10-14T04:41:33.773Z" },
-    { url = "https://files.pythonhosted.org/packages/10/9a/97c8d48ef10d6cd4fcead2415523221624bf58bcf68a802721a6bc807c8f/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb", size = 147889, upload-time = "2025-10-14T04:41:34.897Z" },
-    { url = "https://files.pythonhosted.org/packages/10/bf/979224a919a1b606c82bd2c5fa49b5c6d5727aa47b4312bb27b1734f53cd/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e", size = 143641, upload-time = "2025-10-14T04:41:36.116Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/33/0ad65587441fc730dc7bd90e9716b30b4702dc7b617e6ba4997dc8651495/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14", size = 160779, upload-time = "2025-10-14T04:41:37.229Z" },
-    { url = "https://files.pythonhosted.org/packages/67/ed/331d6b249259ee71ddea93f6f2f0a56cfebd46938bde6fcc6f7b9a3d0e09/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191", size = 159035, upload-time = "2025-10-14T04:41:38.368Z" },
-    { url = "https://files.pythonhosted.org/packages/67/ff/f6b948ca32e4f2a4576aa129d8bed61f2e0543bf9f5f2b7fc3758ed005c9/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838", size = 152542, upload-time = "2025-10-14T04:41:39.862Z" },
-    { url = "https://files.pythonhosted.org/packages/16/85/276033dcbcc369eb176594de22728541a925b2632f9716428c851b149e83/charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6", size = 149524, upload-time = "2025-10-14T04:41:41.319Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/f2/6a2a1f722b6aba37050e626530a46a68f74e63683947a8acff92569f979a/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e", size = 150395, upload-time = "2025-10-14T04:41:42.539Z" },
-    { url = "https://files.pythonhosted.org/packages/60/bb/2186cb2f2bbaea6338cad15ce23a67f9b0672929744381e28b0592676824/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c", size = 143680, upload-time = "2025-10-14T04:41:43.661Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/a5/bf6f13b772fbb2a90360eb620d52ed8f796f3c5caee8398c3b2eb7b1c60d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090", size = 162045, upload-time = "2025-10-14T04:41:44.821Z" },
-    { url = "https://files.pythonhosted.org/packages/df/c5/d1be898bf0dc3ef9030c3825e5d3b83f2c528d207d246cbabe245966808d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152", size = 149687, upload-time = "2025-10-14T04:41:46.442Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/42/90c1f7b9341eef50c8a1cb3f098ac43b0508413f33affd762855f67a410e/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828", size = 160014, upload-time = "2025-10-14T04:41:47.631Z" },
-    { url = "https://files.pythonhosted.org/packages/76/be/4d3ee471e8145d12795ab655ece37baed0929462a86e72372fd25859047c/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec", size = 154044, upload-time = "2025-10-14T04:41:48.81Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/6f/8f7af07237c34a1defe7defc565a9bc1807762f672c0fde711a4b22bf9c0/charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9", size = 99940, upload-time = "2025-10-14T04:41:49.946Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/51/8ade005e5ca5b0d80fb4aff72a3775b325bdc3d27408c8113811a7cbe640/charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c", size = 107104, upload-time = "2025-10-14T04:41:51.051Z" },
-    { url = "https://files.pythonhosted.org/packages/da/5f/6b8f83a55bb8278772c5ae54a577f3099025f9ade59d0136ac24a0df4bde/charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2", size = 100743, upload-time = "2025-10-14T04:41:52.122Z" },
     { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
 ]
 
@@ -679,6 +663,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" },
 ]
 
+[[package]]
+name = "cloudpathlib"
+version = "0.24.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/06/19/58bc6b5d7d0f81c7209b05445af477e147c486552f96665a5912211839b9/cloudpathlib-0.24.0.tar.gz", hash = "sha256:c521a984e77b47e656fe78e20a7e3e260e0ab45fc69e33ac01094227c979e34a", size = 53600, upload-time = "2026-04-30T00:54:43.265Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c2/5b/ba933f896d9b0b07608d575a8501e2b4e32166b60d84c430a4a7285ebe64/cloudpathlib-0.24.0-py3-none-any.whl", hash = "sha256:b1c51e2d2ec7dc4fed6538991f4aea849d6cf11a7e6b9069f86e461aa1f9b5b4", size = 63214, upload-time = "2026-04-30T00:54:42.06Z" },
+]
+
 [[package]]
 name = "cloudpickle"
 version = "3.0.0"
@@ -714,16 +707,12 @@ name = "comment-parser"
 version = "1.2.4"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "python-magic", version = "0.4.24", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.13'" },
@@ -735,14 +724,12 @@ name = "comment-parser"
 version = "1.2.5"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
 ]
 dependencies = [
-    { name = "python-magic", version = "0.4.27", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.13.*'" },
+    { name = "python-magic", version = "0.4.27", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/37/6a/354d8e640b5f90996ac07c002189a552226a0ddaad85efd14863166aaa14/comment_parser-1.2.5.tar.gz", hash = "sha256:5606b769228cafce03d538e361472896581b386f3bc44bd62f4b61ff45ff05ec", size = 8852, upload-time = "2024-12-25T05:08:14.618Z" }
 wheels = [
@@ -750,36 +737,21 @@ wheels = [
 ]
 
 [[package]]
-name = "contextlib2"
-version = "21.6.0"
+name = "confection"
+version = "1.3.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c7/13/37ea7805ae3057992e96ecb1cffa2fa35c2ef4498543b846f90dd2348d8f/contextlib2-21.6.0.tar.gz", hash = "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869", size = 43795, upload-time = "2021-06-27T06:54:40.841Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ca/65/efd0fe8a936fc8ca2978cb7b82581fb20d901c6039e746a808f746b7647b/confection-1.3.3.tar.gz", hash = "sha256:f0f6810d567ff73993fe74d218ca5e1ffb6a44fb03f391257fc5d033546cbfaa", size = 54895, upload-time = "2026-03-24T18:45:24.331Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/76/56/6d6872f79d14c0cb02f1646cbb4592eef935857c0951a105874b7b62a0c3/contextlib2-21.6.0-py2.py3-none-any.whl", hash = "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f", size = 13277, upload-time = "2021-06-27T06:54:20.972Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/e4/d66708bdf0d92fb4d49b22cdff4b10cec38aca5dcd7e81d909bb55c65cd7/confection-1.3.3-py3-none-any.whl", hash = "sha256:b9fef9ee84b237ef4611ec3eb5797b70e13063e6310ad9f15536373f5e313c82", size = 35902, upload-time = "2026-03-24T18:45:22.664Z" },
 ]
 
 [[package]]
-name = "cosmos-xenna"
-version = "0.1.2"
+name = "contextlib2"
+version = "21.6.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "attrs", marker = "python_full_version >= '3.14'" },
-    { name = "cattrs", marker = "python_full_version >= '3.14'" },
-    { name = "jinja2", marker = "python_full_version >= '3.14'" },
-    { name = "loguru", marker = "python_full_version >= '3.14'" },
-    { name = "pulp", marker = "python_full_version >= '3.14'" },
-    { name = "ray", extra = ["default"], marker = "python_full_version >= '3.14'" },
-    { name = "tabulate", marker = "python_full_version >= '3.14'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1c/3f/73cb722613efe4e3e97414b629d55aeb1f4abbacf9490b489242e990499f/cosmos_xenna-0.1.2.tar.gz", hash = "sha256:7beac6fcb3bf771f3a72c1c3443460ee9ab22143962803374cf5b62b5d15d231", size = 299856, upload-time = "2025-08-19T19:35:26.806Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c7/13/37ea7805ae3057992e96ecb1cffa2fa35c2ef4498543b846f90dd2348d8f/contextlib2-21.6.0.tar.gz", hash = "sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869", size = 43795, upload-time = "2021-06-27T06:54:40.841Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a2/5f/91ec927391ce6e2c72f54e63071f3cdbff7a0a169f982aaa4cb253dc8c5d/cosmos_xenna-0.1.2-py3-none-any.whl", hash = "sha256:e22565a5e8f5939a77e37a06d77bd3fa2301034e8c4749053fe9c97009b9b3b8", size = 185830, upload-time = "2025-08-19T19:35:25.503Z" },
+    { url = "https://files.pythonhosted.org/packages/76/56/6d6872f79d14c0cb02f1646cbb4592eef935857c0951a105874b7b62a0c3/contextlib2-21.6.0-py2.py3-none-any.whl", hash = "sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f", size = 13277, upload-time = "2021-06-27T06:54:20.972Z" },
 ]
 
 [[package]]
@@ -812,32 +784,26 @@ name = "cosmos-xenna"
 version = "0.2.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-]
-dependencies = [
-    { name = "attrs", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "cattrs", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jinja2", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "loguru", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "obstore", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "portpicker", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pulp", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "ray", extra = ["default"], marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "tabulate", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "attrs", marker = "python_full_version >= '3.11'" },
+    { name = "cattrs", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "loguru", marker = "python_full_version >= '3.11'" },
+    { name = "obstore", marker = "python_full_version >= '3.11'" },
+    { name = "portpicker", marker = "python_full_version >= '3.11'" },
+    { name = "pulp", marker = "python_full_version >= '3.11'" },
+    { name = "ray", extra = ["default"], marker = "python_full_version >= '3.11'" },
+    { name = "tabulate", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0d/86/182317a8047c1597ae0c0fb43227814d8e1a09e4d63413e54b466ae2422f/cosmos_xenna-0.2.0.tar.gz", hash = "sha256:49f44c9fac39d83b9d78e1dd14271b1bffd4e34924ed5db6409025e52a27ddc9", size = 470618, upload-time = "2026-03-04T16:42:39.895Z" }
 wheels = [
@@ -915,32 +881,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/31/12a4aec689cb942a89129587860ed4d0fd522d5fda81237147fde554b8ae/coverage-7.13.0-cp313-cp313t-win32.whl", hash = "sha256:5f8a0297355e652001015e93be345ee54393e45dc3050af4a0475c5a2b767d46", size = 221505, upload-time = "2025-12-08T13:13:46.332Z" },
     { url = "https://files.pythonhosted.org/packages/65/8c/3b5fe3259d863572d2b0827642c50c3855d26b3aefe80bdc9eba1f0af3b0/coverage-7.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6abb3a4c52f05e08460bd9acf04fec027f8718ecaa0d09c40ffbc3fbd70ecc39", size = 222569, upload-time = "2025-12-08T13:13:47.79Z" },
     { url = "https://files.pythonhosted.org/packages/b0/39/f71fa8316a96ac72fc3908839df651e8eccee650001a17f2c78cdb355624/coverage-7.13.0-cp313-cp313t-win_arm64.whl", hash = "sha256:3ad968d1e3aa6ce5be295ab5fe3ae1bf5bb4769d0f98a80a0252d543a2ef2e9e", size = 220841, upload-time = "2025-12-08T13:13:49.243Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/4b/9b54bedda55421449811dcd5263a2798a63f48896c24dfb92b0f1b0845bd/coverage-7.13.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:453b7ec753cf5e4356e14fe858064e5520c460d3bbbcb9c35e55c0d21155c256", size = 218343, upload-time = "2025-12-08T13:13:50.811Z" },
-    { url = "https://files.pythonhosted.org/packages/59/df/c3a1f34d4bba2e592c8979f924da4d3d4598b0df2392fbddb7761258e3dc/coverage-7.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:af827b7cbb303e1befa6c4f94fd2bf72f108089cfa0f8abab8f4ca553cf5ca5a", size = 218672, upload-time = "2025-12-08T13:13:52.284Z" },
-    { url = "https://files.pythonhosted.org/packages/07/62/eec0659e47857698645ff4e6ad02e30186eb8afd65214fd43f02a76537cb/coverage-7.13.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:9987a9e4f8197a1000280f7cc089e3ea2c8b3c0a64d750537809879a7b4ceaf9", size = 249715, upload-time = "2025-12-08T13:13:53.791Z" },
-    { url = "https://files.pythonhosted.org/packages/23/2d/3c7ff8b2e0e634c1f58d095f071f52ed3c23ff25be524b0ccae8b71f99f8/coverage-7.13.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3188936845cd0cb114fa6a51842a304cdbac2958145d03be2377ec41eb285d19", size = 252225, upload-time = "2025-12-08T13:13:55.274Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/ac/fb03b469d20e9c9a81093575003f959cf91a4a517b783aab090e4538764b/coverage-7.13.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2bdb3babb74079f021696cb46b8bb5f5661165c385d3a238712b031a12355be", size = 253559, upload-time = "2025-12-08T13:13:57.161Z" },
-    { url = "https://files.pythonhosted.org/packages/29/62/14afa9e792383c66cc0a3b872a06ded6e4ed1079c7d35de274f11d27064e/coverage-7.13.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7464663eaca6adba4175f6c19354feea61ebbdd735563a03d1e472c7072d27bb", size = 249724, upload-time = "2025-12-08T13:13:58.692Z" },
-    { url = "https://files.pythonhosted.org/packages/31/b7/333f3dab2939070613696ab3ee91738950f0467778c6e5a5052e840646b7/coverage-7.13.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8069e831f205d2ff1f3d355e82f511eb7c5522d7d413f5db5756b772ec8697f8", size = 251582, upload-time = "2025-12-08T13:14:00.642Z" },
-    { url = "https://files.pythonhosted.org/packages/81/cb/69162bda9381f39b2287265d7e29ee770f7c27c19f470164350a38318764/coverage-7.13.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6fb2d5d272341565f08e962cce14cdf843a08ac43bd621783527adb06b089c4b", size = 249538, upload-time = "2025-12-08T13:14:02.556Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/76/350387b56a30f4970abe32b90b2a434f87d29f8b7d4ae40d2e8a85aacfb3/coverage-7.13.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:5e70f92ef89bac1ac8a99b3324923b4749f008fdbd7aa9cb35e01d7a284a04f9", size = 249349, upload-time = "2025-12-08T13:14:04.015Z" },
-    { url = "https://files.pythonhosted.org/packages/86/0d/7f6c42b8d59f4c7e43ea3059f573c0dcfed98ba46eb43c68c69e52ae095c/coverage-7.13.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4b5de7d4583e60d5fd246dd57fcd3a8aa23c6e118a8c72b38adf666ba8e7e927", size = 251011, upload-time = "2025-12-08T13:14:05.505Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/f1/4bb2dff379721bb0b5c649d5c5eaf438462cad824acf32eb1b7ca0c7078e/coverage-7.13.0-cp314-cp314-win32.whl", hash = "sha256:a6c6e16b663be828a8f0b6c5027d36471d4a9f90d28444aa4ced4d48d7d6ae8f", size = 221091, upload-time = "2025-12-08T13:14:07.127Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/44/c239da52f373ce379c194b0ee3bcc121020e397242b85f99e0afc8615066/coverage-7.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:0900872f2fdb3ee5646b557918d02279dc3af3dfb39029ac4e945458b13f73bc", size = 221904, upload-time = "2025-12-08T13:14:08.542Z" },
-    { url = "https://files.pythonhosted.org/packages/89/1f/b9f04016d2a29c2e4a0307baefefad1a4ec5724946a2b3e482690486cade/coverage-7.13.0-cp314-cp314-win_arm64.whl", hash = "sha256:3a10260e6a152e5f03f26db4a407c4c62d3830b9af9b7c0450b183615f05d43b", size = 220480, upload-time = "2025-12-08T13:14:10.958Z" },
-    { url = "https://files.pythonhosted.org/packages/16/d4/364a1439766c8e8647860584171c36010ca3226e6e45b1753b1b249c5161/coverage-7.13.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9097818b6cc1cfb5f174e3263eba4a62a17683bcfe5c4b5d07f4c97fa51fbf28", size = 219074, upload-time = "2025-12-08T13:14:13.345Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/f4/71ba8be63351e099911051b2089662c03d5671437a0ec2171823c8e03bec/coverage-7.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0018f73dfb4301a89292c73be6ba5f58722ff79f51593352759c1790ded1cabe", size = 219342, upload-time = "2025-12-08T13:14:15.02Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/25/127d8ed03d7711a387d96f132589057213e3aef7475afdaa303412463f22/coverage-7.13.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:166ad2a22ee770f5656e1257703139d3533b4a0b6909af67c6b4a3adc1c98657", size = 260713, upload-time = "2025-12-08T13:14:16.907Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/db/559fbb6def07d25b2243663b46ba9eb5a3c6586c0c6f4e62980a68f0ee1c/coverage-7.13.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f6aaef16d65d1787280943f1c8718dc32e9cf141014e4634d64446702d26e0ff", size = 262825, upload-time = "2025-12-08T13:14:18.68Z" },
-    { url = "https://files.pythonhosted.org/packages/37/99/6ee5bf7eff884766edb43bd8736b5e1c5144d0fe47498c3779326fe75a35/coverage-7.13.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e999e2dcc094002d6e2c7bbc1fb85b58ba4f465a760a8014d97619330cdbbbf3", size = 265233, upload-time = "2025-12-08T13:14:20.55Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/90/92f18fe0356ea69e1f98f688ed80cec39f44e9f09a1f26a1bbf017cc67f2/coverage-7.13.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:00c3d22cf6fb1cf3bf662aaaa4e563be8243a5ed2630339069799835a9cc7f9b", size = 259779, upload-time = "2025-12-08T13:14:22.367Z" },
-    { url = "https://files.pythonhosted.org/packages/90/5d/b312a8b45b37a42ea7d27d7d3ff98ade3a6c892dd48d1d503e773503373f/coverage-7.13.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:22ccfe8d9bb0d6134892cbe1262493a8c70d736b9df930f3f3afae0fe3ac924d", size = 262700, upload-time = "2025-12-08T13:14:24.309Z" },
-    { url = "https://files.pythonhosted.org/packages/63/f8/b1d0de5c39351eb71c366f872376d09386640840a2e09b0d03973d791e20/coverage-7.13.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:9372dff5ea15930fea0445eaf37bbbafbc771a49e70c0aeed8b4e2c2614cc00e", size = 260302, upload-time = "2025-12-08T13:14:26.068Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/7c/d42f4435bc40c55558b3109a39e2d456cddcec37434f62a1f1230991667a/coverage-7.13.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:69ac2c492918c2461bc6ace42d0479638e60719f2a4ef3f0815fa2df88e9f940", size = 259136, upload-time = "2025-12-08T13:14:27.604Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/d3/23413241dc04d47cfe19b9a65b32a2edd67ecd0b817400c2843ebc58c847/coverage-7.13.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:739c6c051a7540608d097b8e13c76cfa85263ced467168dc6b477bae3df7d0e2", size = 261467, upload-time = "2025-12-08T13:14:29.09Z" },
-    { url = "https://files.pythonhosted.org/packages/13/e6/6e063174500eee216b96272c0d1847bf215926786f85c2bd024cf4d02d2f/coverage-7.13.0-cp314-cp314t-win32.whl", hash = "sha256:fe81055d8c6c9de76d60c94ddea73c290b416e061d40d542b24a5871bad498b7", size = 221875, upload-time = "2025-12-08T13:14:31.106Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/46/f4fb293e4cbe3620e3ac2a3e8fd566ed33affb5861a9b20e3dd6c1896cbc/coverage-7.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:445badb539005283825959ac9fa4a28f712c214b65af3a2c464f1adc90f5fcbc", size = 222982, upload-time = "2025-12-08T13:14:33.1Z" },
-    { url = "https://files.pythonhosted.org/packages/68/62/5b3b9018215ed9733fbd1ae3b2ed75c5de62c3b55377a52cae732e1b7805/coverage-7.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:de7f6748b890708578fc4b7bb967d810aeb6fcc9bff4bb77dbca77dab2f9df6a", size = 221016, upload-time = "2025-12-08T13:14:34.601Z" },
     { url = "https://files.pythonhosted.org/packages/8d/4c/1968f32fb9a2604645827e11ff84a31e59d532e01995f904723b4f5328b3/coverage-7.13.0-py3-none-any.whl", hash = "sha256:850d2998f380b1e266459ca5b47bc9e7daf9af1d070f66317972f382d46f1904", size = 210068, upload-time = "2025-12-08T13:14:36.236Z" },
 ]
 
@@ -951,40 +891,48 @@ toml = [
 
 [[package]]
 name = "cryptography"
-version = "42.0.8"
+version = "48.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/93/a7/1498799a2ea06148463a9a2c10ab2f6a921a74fb19e231b27dc412a748e2/cryptography-42.0.8.tar.gz", hash = "sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2", size = 671250, upload-time = "2024-06-04T19:55:08.609Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f9/8b/1b929ba8139430e09e140e6939c2b29c18df1f2fc2149e41bdbdcdaf5d1f/cryptography-42.0.8-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e", size = 5899961, upload-time = "2024-06-04T19:53:57.933Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/5d/31d833daa800e4fab33209843095df7adb4a78ea536929145534cbc15026/cryptography-42.0.8-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d", size = 3114353, upload-time = "2024-06-04T19:54:12.171Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/32/f6326c70a9f0f258a201d3b2632bca586ea24d214cec3cf36e374040e273/cryptography-42.0.8-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902", size = 3647773, upload-time = "2024-06-04T19:54:07.051Z" },
-    { url = "https://files.pythonhosted.org/packages/35/66/2d87e9ca95c82c7ee5f2c09716fc4c4242c1ae6647b9bd27e55e920e9f10/cryptography-42.0.8-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801", size = 3839763, upload-time = "2024-06-04T19:54:30.383Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/de/8083fa2e68d403553a01a9323f4f8b9d7ffed09928ba25635c29fb28c1e7/cryptography-42.0.8-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949", size = 3632661, upload-time = "2024-06-04T19:54:32.955Z" },
-    { url = "https://files.pythonhosted.org/packages/07/40/d6f6819c62e808ea74639c3c640f7edd636b86cce62cb14943996a15df92/cryptography-42.0.8-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9", size = 3851536, upload-time = "2024-06-04T19:53:53.131Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/46/de71d48abf2b6d3c808f4fbb0f4dc44a4e72786be23df0541aa2a3f6fd7e/cryptography-42.0.8-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583", size = 3754209, upload-time = "2024-06-04T19:54:55.259Z" },
-    { url = "https://files.pythonhosted.org/packages/25/c9/86f04e150c5d5d5e4a731a2c1e0e43da84d901f388e3fea3d5de98d689a7/cryptography-42.0.8-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7", size = 3923551, upload-time = "2024-06-04T19:54:16.46Z" },
-    { url = "https://files.pythonhosted.org/packages/53/c2/903014dafb7271fb148887d4355b2e90319cad6e810663be622b0c933fc9/cryptography-42.0.8-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b", size = 3739265, upload-time = "2024-06-04T19:54:23.194Z" },
-    { url = "https://files.pythonhosted.org/packages/95/26/82d704d988a193cbdc69ac3b41c687c36eaed1642cce52530ad810c35645/cryptography-42.0.8-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7", size = 3937371, upload-time = "2024-06-04T19:55:04.303Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/71/4e0d05c9acd638a225f57fb6162aa3d03613c11b76893c23ea4675bb28c5/cryptography-42.0.8-cp37-abi3-win32.whl", hash = "sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2", size = 2438849, upload-time = "2024-06-04T19:54:27.39Z" },
-    { url = "https://files.pythonhosted.org/packages/06/0f/78da3cad74f2ba6c45321dc90394d70420ea846730dc042ef527f5a224b5/cryptography-42.0.8-cp37-abi3-win_amd64.whl", hash = "sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba", size = 2889090, upload-time = "2024-06-04T19:54:14.245Z" },
-    { url = "https://files.pythonhosted.org/packages/60/12/f064af29190cdb1d38fe07f3db6126091639e1dece7ec77c4ff037d49193/cryptography-42.0.8-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28", size = 5901232, upload-time = "2024-06-04T19:54:52.722Z" },
-    { url = "https://files.pythonhosted.org/packages/43/c2/4a3eef67e009a522711ebd8ac89424c3a7fe591ece7035d964419ad52a1d/cryptography-42.0.8-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e", size = 3648711, upload-time = "2024-06-04T19:54:44.323Z" },
-    { url = "https://files.pythonhosted.org/packages/49/1c/9f6d13cc8041c05eebff1154e4e71bedd1db8e174fff999054435994187a/cryptography-42.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70", size = 3841968, upload-time = "2024-06-04T19:54:57.911Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/f9/c3d4f19b82bdb25a3d857fe96e7e571c981810e47e3f299cc13ac429066a/cryptography-42.0.8-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c", size = 3633032, upload-time = "2024-06-04T19:54:48.518Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/e2/b7e6e8c261536c489d9cf908769880d94bd5d9a187e166b0dc838d2e6a56/cryptography-42.0.8-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7", size = 3852478, upload-time = "2024-06-04T19:54:50.599Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/68/e16751f6b859bc120f53fddbf3ebada5c34f0e9689d8af32884d8b2e4b4c/cryptography-42.0.8-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e", size = 3754102, upload-time = "2024-06-04T19:54:46.231Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/38/85c74d0ac4c540780e072b1e6f148ecb718418c1062edcb20d22f3ec5bbb/cryptography-42.0.8-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961", size = 3925042, upload-time = "2024-06-04T19:54:34.767Z" },
-    { url = "https://files.pythonhosted.org/packages/89/f4/a8b982e88eb5350407ebdbf4717b55043271d878705329e107f4783555f2/cryptography-42.0.8-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1", size = 3738833, upload-time = "2024-06-04T19:54:05.231Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/2b/be327b580645927bb1a1f32d5a175b897a9b956bc085b095e15c40bac9ed/cryptography-42.0.8-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14", size = 3938751, upload-time = "2024-06-04T19:54:37.837Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/d5/c6a78ffccdbe4516711ebaa9ed2c7eb6ac5dfa3dc920f2c7e920af2418b0/cryptography-42.0.8-cp39-abi3-win32.whl", hash = "sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c", size = 2439281, upload-time = "2024-06-04T19:53:55.903Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/7b/b0d330852dd5953daee6b15f742f15d9f18e9c0154eb4cfcc8718f0436da/cryptography-42.0.8-cp39-abi3-win_amd64.whl", hash = "sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a", size = 2886038, upload-time = "2024-06-04T19:54:18.707Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/fe/1e21699f0a7904e8a30d4fc6db262958f1edf5e505a02e7d97a5b419e482/cryptography-42.0.8-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe", size = 3014449, upload-time = "2024-06-04T19:54:40.379Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/f3/61b398b5ec61f4b6ffbf746227df7ebb421696458d9625d634043f236a13/cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c", size = 3558533, upload-time = "2024-06-04T19:54:42.123Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/e2/60b05e720766e185ef097d07068bd878a51d613ef91e4c241750f9c9192b/cryptography-42.0.8-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71", size = 3759330, upload-time = "2024-06-04T19:54:09.258Z" },
-    { url = "https://files.pythonhosted.org/packages/10/38/2c8dae407d301eaf942e377a5b2b30485cfa0df03c6c2dcc2ac044054ed9/cryptography-42.0.8-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d", size = 2801764, upload-time = "2024-06-04T19:54:25.455Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/9f/a9/db8f313fdcd85d767d4973515e1db101f9c71f95fced83233de224673757/cryptography-48.0.0.tar.gz", hash = "sha256:5c3932f4436d1cccb036cb0eaef46e6e2db91035166f1ad6505c3c9d5a635920", size = 832984, upload-time = "2026-05-04T22:59:38.133Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/3d/01f6dd9190170a5a241e0e98c2d04be3664a9e6f5b9b872cde63aff1c3dd/cryptography-48.0.0-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:0c558d2cdffd8f4bbb30fc7134c74d2ca9a476f830bb053074498fbc86f41ed6", size = 8001587, upload-time = "2026-05-04T22:57:36.803Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/6e/e90527eef33f309beb811cf7c982c3aeffcce8e3edb178baa4ca3ae4a6fa/cryptography-48.0.0-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f5333311663ea94f75dd408665686aaf426563556bb5283554a3539177e03b8c", size = 4690433, upload-time = "2026-05-04T22:57:40.373Z" },
+    { url = "https://files.pythonhosted.org/packages/90/04/673510ed51ddff56575f306cf1617d80411ee76831ccd3097599140efdfe/cryptography-48.0.0-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7995ef305d7165c3f11ae07f2517e5a4f1d5c18da1376a0a9ed496336b69e5f3", size = 4710620, upload-time = "2026-05-04T22:57:42.935Z" },
+    { url = "https://files.pythonhosted.org/packages/14/d5/e9c4ef932c8d800490c34d8bd589d64a31d5890e27ec9e9ad532be893294/cryptography-48.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:40ba1f85eaa6959837b1d51c9767e230e14612eea4ef110ee8854ada22da1bf5", size = 4696283, upload-time = "2026-05-04T22:57:45.294Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/29/174b9dfb60b12d59ecfc6cfa04bc88c21b42a54f01b8aae09bb6e51e4c7f/cryptography-48.0.0-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:369a6348999f94bbd53435c894377b20ab95f25a9065c283570e70150d8abc3c", size = 5296573, upload-time = "2026-05-04T22:57:47.933Z" },
+    { url = "https://files.pythonhosted.org/packages/95/38/0d29a6fd7d0d1373f0c0c88a04ba20e359b257753ac497564cd660fc1d55/cryptography-48.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a0e692c683f4df67815a2d258b324e66f4738bd7a96a218c826dce4f4bd05d8f", size = 4743677, upload-time = "2026-05-04T22:57:50.067Z" },
+    { url = "https://files.pythonhosted.org/packages/30/be/eef653013d5c63b6a490529e0316f9ac14a37602965d4903efed1399f32b/cryptography-48.0.0-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:18349bbc56f4743c8b12dc32e2bccb2cf83ee8b69a3bba74ef8ae857e26b3d25", size = 4330808, upload-time = "2026-05-04T22:57:52.301Z" },
+    { url = "https://files.pythonhosted.org/packages/84/9e/500463e87abb7a0a0f9f256ec21123ecde0a7b5541a15e840ea54551fd81/cryptography-48.0.0-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e8eac43dfca5c4cccc6dad9a80504436fca53bb9bc3100a2386d730fbe6b602", size = 4695941, upload-time = "2026-05-04T22:57:54.603Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/dc/7303087450c2ec9e7fbb750e17c2abfbc658f23cbd0e54009509b7cc4091/cryptography-48.0.0-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:9ccdac7d40688ecb5a3b4a604b8a88c8002e3442d6c60aead1db2a89a041560c", size = 5252579, upload-time = "2026-05-04T22:57:57.207Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/c0/7101d3b7215edcdc90c45da544961fd8ed2d6448f77577460fa75a8443f7/cryptography-48.0.0-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:bd72e68b06bb1e96913f97dd4901119bc17f39d4586a5adf2d3e47bc2b9d58b5", size = 4743326, upload-time = "2026-05-04T22:57:59.535Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/d8/5b833bad13016f562ab9d063d68199a4bd121d18458e439515601d3357ec/cryptography-48.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:59baa2cb386c4f0b9905bd6eb4c2a79a69a128408fd31d32ca4d7102d4156321", size = 4826672, upload-time = "2026-05-04T22:58:01.996Z" },
+    { url = "https://files.pythonhosted.org/packages/98/e1/7074eb8bf3c135558c73fc2bcf0f5633f912e6fb87e868a55c454080ef09/cryptography-48.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9249e3cd978541d665967ac2cb2787fd6a62bddf1e75b3e347a594d7dacf4f74", size = 4972574, upload-time = "2026-05-04T22:58:03.968Z" },
+    { url = "https://files.pythonhosted.org/packages/04/70/e5a1b41d325f797f39427aa44ef8baf0be500065ab6d8e10369d850d4a4f/cryptography-48.0.0-cp311-abi3-win32.whl", hash = "sha256:9c459db21422be75e2809370b829a87eb37f74cd785fc4aa9ea1e5f43b47cda4", size = 3294868, upload-time = "2026-05-04T22:58:06.467Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/ac/8ac51b4a5fc5932eb7ee5c517ba7dc8cd834f0048962b6b352f00f41ebf9/cryptography-48.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:5b012212e08b8dd5edc78ef54da83dd9892fd9105323b3993eff6bea65dc21d7", size = 3817107, upload-time = "2026-05-04T22:58:08.845Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/63/61d4a4e1c6b6bab6ce1e213cd36a24c415d90e76d78c5eb8577c5541d2e8/cryptography-48.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:58d00498e8933e4a194f3076aee1b4a97dfec1a6da444535755822fe5d8b0b86", size = 7983482, upload-time = "2026-05-04T22:58:43.769Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/ac/f5b5995b87770c693e2596559ffafe195b4033a57f14a82268a2842953f3/cryptography-48.0.0-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:614d0949f4790582d2cc25553abd09dd723025f0c0e7c67376a1d77196743d6e", size = 4683266, upload-time = "2026-05-04T22:58:46.064Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/c6/8b14f67e18338fbc4adb76f66c001f5c3610b3e2d1837f268f47a347dbbb/cryptography-48.0.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7ce4bfae76319a532a2dc68f82cc32f5676ee792a983187dac07183690e5c66f", size = 4696228, upload-time = "2026-05-04T22:58:48.22Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/73/f808fbae9514bd91b47875b003f13e284c8c6bdfd904b7944e803937eec1/cryptography-48.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2eb992bbd4661238c5a397594c83f5b4dc2bc5b848c365c8f991b6780efcc5c7", size = 4689097, upload-time = "2026-05-04T22:58:50.9Z" },
+    { url = "https://files.pythonhosted.org/packages/93/01/d86632d7d28db8ae83221995752eeb6639ffb374c2d22955648cf8d52797/cryptography-48.0.0-cp39-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:22a5cb272895dce158b2cacdfdc3debd299019659f42947dbdac6f32d68fe832", size = 5283582, upload-time = "2026-05-04T22:58:53.017Z" },
+    { url = "https://files.pythonhosted.org/packages/02/e1/50edc7a50334807cc4791fc4a0ce7468b4a1416d9138eab358bfc9a3d70b/cryptography-48.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2b4d59804e8408e2fea7d1fbaf218e5ec984325221db76e6a241a9abd6cdd95c", size = 4730479, upload-time = "2026-05-04T22:58:55.611Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/af/99a582b1b1641ff5911ac559beb45097cf79efd4ead4657f578ef1af2d47/cryptography-48.0.0-cp39-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:984a20b0f62a26f48a3396c72e4bc34c66e356d356bf370053066b3b6d54634a", size = 4326481, upload-time = "2026-05-04T22:58:57.607Z" },
+    { url = "https://files.pythonhosted.org/packages/90/ee/89aa26a06ef0a7d7611788ffd571a7c50e368cc6a4d5eef8b4884e866edb/cryptography-48.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:5a5ed8fde7a1d09376ca0b40e68cd59c69fe23b1f9768bd5824f54681626032a", size = 4688713, upload-time = "2026-05-04T22:59:00.077Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ba/bcb1b0bb7a33d4c7c0c4d4c7874b4a62ae4f56113a5f4baefa362dfb1f0f/cryptography-48.0.0-cp39-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:8cd666227ef7af430aa5914a9910e0ddd703e75f039cef0825cd0da71b6b711a", size = 5238165, upload-time = "2026-05-04T22:59:02.317Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/70/ca4003b1ce5ca3dc3186ada51908c8a9b9ff7d5cab83cc0d43ee14ec144f/cryptography-48.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9071196d81abc88b3516ac8cdfad32e2b66dd4a5393a8e68a961e9161ddc6239", size = 4729947, upload-time = "2026-05-04T22:59:05.255Z" },
+    { url = "https://files.pythonhosted.org/packages/44/a0/4ec7cf774207905aef1a8d11c3750d5a1db805eb380ee4e16df317870128/cryptography-48.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1e2d54c8be6152856a36f0882ab231e70f8ec7f14e93cf87db8a2ed056bf160c", size = 4822059, upload-time = "2026-05-04T22:59:07.802Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/75/a2e55f99c16fcac7b5d6c1eb19ad8e00799854d6be5ca845f9259eae1681/cryptography-48.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a5da777e32ffed6f85a7b2b3f7c5cbc88c146bfcd0a1d7baf5fcc6c52ee35dd4", size = 4960575, upload-time = "2026-05-04T22:59:09.851Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/23/6e6f32143ab5d8b36ca848a502c4bcd477ae75b9e1677e3530d669062578/cryptography-48.0.0-cp39-abi3-win32.whl", hash = "sha256:77a2ccbbe917f6710e05ba9adaa25fb5075620bf3ea6fb751997875aff4ae4bd", size = 3279117, upload-time = "2026-05-04T22:59:12.019Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/9a/0fea98a70cf1749d41d738836f6349d97945f7c89433a259a6c2642eefeb/cryptography-48.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:16cd65b9330583e4619939b3a3843eec1e6e789744bb01e7c7e2e62e33c239c8", size = 3792100, upload-time = "2026-05-04T22:59:14.884Z" },
+    { url = "https://files.pythonhosted.org/packages/be/d2/024b5e06be9d44cb021fb0e1a03d34d63989cf56a0fe62f3dfbab695b9b4/cryptography-48.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:84cf79f0dc8b36ac5da873481716e87aef31fcfa0444f9e1d8b4b2cece142855", size = 3950391, upload-time = "2026-05-04T22:59:17.415Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/17/3861e17c56fa0fd37491a14a8673fdb77c57fc5693cafe745ea8b06dba75/cryptography-48.0.0-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:fdfef35d751d510fcef5252703621574364fec16418c4a1e5e1055248401054b", size = 4637126, upload-time = "2026-05-04T22:59:20.197Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/0a/7e226dbff530f21480727eb764973a7bff2b912f8e15cd4f129e71b56d1d/cryptography-48.0.0-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:0890f502ddf7d9c6426129c3f49f5c0a39278ed7cd6322c8755ffca6ee675a13", size = 4667270, upload-time = "2026-05-04T22:59:22.647Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/f2/5a72274ca9f1b2a8b44a662ee0bf1b435909deb473d6f97bcd035bcdbc71/cryptography-48.0.0-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:ecde28a596bead48b0cfd2a1b4416c3d43074c2d785e3a398d7ec1fc4d0f7fbb", size = 4636797, upload-time = "2026-05-04T22:59:24.912Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/e1/48cedb2fe63626e91ded1edad159e2a4fb8b6906c4425eb7749673077ce7/cryptography-48.0.0-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:4defde8685ae324a9eb9d818717e93b4638ef67070ac9bc15b8ca85f63048355", size = 4666800, upload-time = "2026-05-04T22:59:27.474Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/ca/7e8365deec19afb2b2c7be7c1c0aa8f99633b54e90c570999acda93260fc/cryptography-48.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:db63bf618e5dea46c07de12e900fe1cdd2541e6dc9dbae772a70b7d4d4765f6a", size = 3739536, upload-time = "2026-05-04T22:59:29.61Z" },
 ]
 
 [[package]]
@@ -992,7 +940,7 @@ name = "cuda-bindings"
 version = "12.9.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cuda-pathfinder", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "cuda-pathfinder", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7a/d8/b546104b8da3f562c1ff8ab36d130c8fe1dd6a045ced80b4f6ad74f7d4e1/cuda_bindings-12.9.4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d3c842c2a4303b2a580fe955018e31aea30278be19795ae05226235268032e5", size = 12148218, upload-time = "2025-10-21T14:51:28.855Z" },
@@ -1000,8 +948,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" },
     { url = "https://files.pythonhosted.org/packages/63/56/e465c31dc9111be3441a9ba7df1941fe98f4aa6e71e8788a3fb4534ce24d/cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:32bdc5a76906be4c61eb98f546a6786c5773a881f3b166486449b5d141e4a39f", size = 11906628, upload-time = "2025-10-21T14:51:49.905Z" },
     { url = "https://files.pythonhosted.org/packages/a3/84/1e6be415e37478070aeeee5884c2022713c1ecc735e6d82d744de0252eee/cuda_bindings-12.9.4-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56e0043c457a99ac473ddc926fe0dc4046694d99caef633e92601ab52cbe17eb", size = 11925991, upload-time = "2025-10-21T14:51:56.535Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/af/6dfd8f2ed90b1d4719bc053ff8940e494640fe4212dc3dd72f383e4992da/cuda_bindings-12.9.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8b72ee72a9cc1b531db31eebaaee5c69a8ec3500e32c6933f2d3b15297b53686", size = 11922703, upload-time = "2025-10-21T14:52:03.585Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/19/90ac264acc00f6df8a49378eedec9fd2db3061bf9263bf9f39fd3d8377c3/cuda_bindings-12.9.4-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80bffc357df9988dca279734bc9674c3934a654cab10cadeed27ce17d8635ee", size = 11924658, upload-time = "2025-10-21T14:52:10.411Z" },
 ]
 
 [[package]]
@@ -1144,7 +1090,54 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/37/f0/0f1d79c0c7fccbc2ed0c0ff3be1b0562be60b764c729ca8ded1bd6d953aa/cupy_cuda12x-14.0.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:bfbde2e9f7946021b49414f9c800991163f2a56a1318f3d7d69cbb06001a1585", size = 135080693, upload-time = "2026-02-20T10:22:35.843Z" },
     { url = "https://files.pythonhosted.org/packages/5a/a6/944406223a190815d9df156a1d66f3b0352bd8827dc4a8c752196d616dbc/cupy_cuda12x-14.0.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:9f0c81c3509f77be3ae8444759d5b314201b2dfcbbf2ae0d0b5fb7a61f20893c", size = 134613763, upload-time = "2026-02-20T10:22:56.792Z" },
     { url = "https://files.pythonhosted.org/packages/80/53/037c931731151c504cfc00069eb295c903927c92145115623f13bd2ea076/cupy_cuda12x-14.0.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:21fcb4e917e43237edcc5e3a1a1241e2a2946ba9e577ce36fd580bd9856f91e8", size = 134227269, upload-time = "2026-02-20T10:23:16.147Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/73/34e5f334f6b1e5c5dff80af8109979fb0e8461b27e4454517e0e47486455/cupy_cuda12x-14.0.1-cp314-cp314-manylinux2014_x86_64.whl", hash = "sha256:fa356384760e01498d010af2d96de536ef3dad19db1d3a1ad0764e4323fb919f", size = 133521354, upload-time = "2026-02-20T10:23:37.063Z" },
+]
+
+[[package]]
+name = "cymem"
+version = "2.0.13"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/2f0fbb32535c3731b7c2974c569fb9325e0a38ed5565a08e1139a3b71e82/cymem-2.0.13.tar.gz", hash = "sha256:1c91a92ae8c7104275ac26bd4d29b08ccd3e7faff5893d3858cb6fadf1bc1588", size = 12320, upload-time = "2025-11-14T14:58:36.902Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5d/14/462018dd384ee1848ac9c1951534a813a325abbfc161a74e2cbcb38d2469/cymem-2.0.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8efc4f308169237aade0e82877a65a563833dec32eb7ab2326120253e0e9e918", size = 43747, upload-time = "2025-11-14T14:57:11.287Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/9b/c123ba65dddcd8a2bc0b3c9046766c15abe0e257c315b3040eed22cce1e2/cymem-2.0.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e03bb575a96c59bc210d7d59862747f0012696b0dac3427ce8af33c7afb3d4a2", size = 43328, upload-time = "2025-11-14T14:57:12.578Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/be/7b7a4cf9cd2d37e674612a86fc90b3d59bff12177f83430e62b25afaf7fc/cymem-2.0.13-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1775d3fd34cf099929b79c3e48469283642463f977af6801231f3c0e5d9c9369", size = 231539, upload-time = "2025-11-14T14:57:14.441Z" },
+    { url = "https://files.pythonhosted.org/packages/79/6d/d165c38cd4caaaf60942e2cec9998b667008f2384047ccfe0b4b5f7a1ffe/cymem-2.0.13-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:84e2976e38cd663f758e40b5497fa5cd183d7c5fb0d04ce81a4b42a1ba124ff0", size = 229674, upload-time = "2025-11-14T14:57:15.685Z" },
+    { url = "https://files.pythonhosted.org/packages/95/c1/af83c03a93f890ca81149561b18a4a67a9aa36a1109f15e291dd2703ab12/cymem-2.0.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed9de1b9b042f76fe5c312e4359eab58bf52ac7dfdf6887368a760410d809440", size = 229805, upload-time = "2025-11-14T14:57:17.289Z" },
+    { url = "https://files.pythonhosted.org/packages/03/2d/12900758b80345d9aed5892a9d61e8a5f6abbbe5837e4def373a53cd0da2/cymem-2.0.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1366c7437a209230f4b797fae10227a8206d4021d37c9f9c0d31fd97ea4feb35", size = 234018, upload-time = "2025-11-14T14:57:18.512Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/8b/5fcf5430fc81098aef58cc20340e51f37b49b9d8c15766e0d5d63e7288a3/cymem-2.0.13-cp310-cp310-win_amd64.whl", hash = "sha256:7700b116524b087e0169f10f267539223b48240ef2734c3a727a9e6b4db9a671", size = 40102, upload-time = "2025-11-14T14:57:19.972Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/d3/cb6c83758fe399443b858faafb7096b72535621a7af7dd9a54ff0989fa14/cymem-2.0.13-cp310-cp310-win_arm64.whl", hash = "sha256:c8dbfddfe5c604974e17c6f373cedd4d25cd67f84812ede7dea12128fa0c2015", size = 36282, upload-time = "2025-11-14T14:57:21.398Z" },
+    { url = "https://files.pythonhosted.org/packages/10/64/1db41f7576a6b69f70367e3c15e968fd775ba7419e12059c9966ceb826f8/cymem-2.0.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:673183466b0ff2e060d97ec5116711d44200b8f7be524323e080d215ee2d44a5", size = 43587, upload-time = "2025-11-14T14:57:22.39Z" },
+    { url = "https://files.pythonhosted.org/packages/81/13/57f936fc08551323aab3f92ff6b7f4d4b89d5b4e495c870a67cb8d279757/cymem-2.0.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bee2791b3f6fc034ce41268851462bf662ff87e8947e35fb6dd0115b4644a61f", size = 43139, upload-time = "2025-11-14T14:57:23.363Z" },
+    { url = "https://files.pythonhosted.org/packages/32/a6/9345754be51e0479aa387b7b6cffc289d0fd3201aaeb8dade4623abd1e02/cymem-2.0.13-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f3aee3adf16272bca81c5826eed55ba3c938add6d8c9e273f01c6b829ecfde22", size = 245063, upload-time = "2025-11-14T14:57:24.839Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/01/6bc654101526fa86e82bf6b05d99b2cd47c30a333cfe8622c26c0592beb2/cymem-2.0.13-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:30c4e75a3a1d809e89106b0b21803eb78e839881aa1f5b9bd27b454bc73afde3", size = 244496, upload-time = "2025-11-14T14:57:26.42Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/fb/853b7b021e701a1f41687f3704d5f469aeb2a4f898c3fbb8076806885955/cymem-2.0.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec99efa03cf8ec11c8906aa4d4cc0c47df393bc9095c9dd64b89b9b43e220b04", size = 243287, upload-time = "2025-11-14T14:57:27.542Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/2b/0e4664cafc581de2896d75000651fd2ce7094d33263f466185c28ffc96e4/cymem-2.0.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c90a6ecba994a15b17a3f45d7ec74d34081df2f73bd1b090e2adc0317e4e01b6", size = 248287, upload-time = "2025-11-14T14:57:29.055Z" },
+    { url = "https://files.pythonhosted.org/packages/21/0f/f94c6950edbfc2aafb81194fc40b6cacc8e994e9359d3cb4328c5705b9b5/cymem-2.0.13-cp311-cp311-win_amd64.whl", hash = "sha256:ce821e6ba59148ed17c4567113b8683a6a0be9c9ac86f14e969919121efb61a5", size = 40116, upload-time = "2025-11-14T14:57:30.592Z" },
+    { url = "https://files.pythonhosted.org/packages/00/df/2455eff6ac0381ff165db6883b311f7016e222e3dd62185517f8e8187ed0/cymem-2.0.13-cp311-cp311-win_arm64.whl", hash = "sha256:0dca715e708e545fd1d97693542378a00394b20a37779c1ae2c8bdbb43acef79", size = 36349, upload-time = "2025-11-14T14:57:31.573Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/52/478a2911ab5028cb710b4900d64aceba6f4f882fcb13fd8d40a456a1b6dc/cymem-2.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:e8afbc5162a0fe14b6463e1c4e45248a1b2fe2cbcecc8a5b9e511117080da0eb", size = 43745, upload-time = "2025-11-14T14:57:32.52Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/71/f0f8adee945524774b16af326bd314a14a478ed369a728a22834e6785a18/cymem-2.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9251d889348fe79a75e9b3e4d1b5fa651fca8a64500820685d73a3acc21b6a8", size = 42927, upload-time = "2025-11-14T14:57:33.827Z" },
+    { url = "https://files.pythonhosted.org/packages/62/6d/159780fe162ff715d62b809246e5fc20901cef87ca28b67d255a8d741861/cymem-2.0.13-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:742fc19764467a49ed22e56a4d2134c262d73a6c635409584ae3bf9afa092c33", size = 258346, upload-time = "2025-11-14T14:57:34.917Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/12/678d16f7aa1996f947bf17b8cfb917ea9c9674ef5e2bd3690c04123d5680/cymem-2.0.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f190a92fe46197ee64d32560eb121c2809bb843341733227f51538ce77b3410d", size = 260843, upload-time = "2025-11-14T14:57:36.503Z" },
+    { url = "https://files.pythonhosted.org/packages/31/5d/0dd8c167c08cd85e70d274b7235cfe1e31b3cebc99221178eaf4bbb95c6f/cymem-2.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d670329ee8dbbbf241b7c08069fe3f1d3a1a3e2d69c7d05ea008a7010d826298", size = 254607, upload-time = "2025-11-14T14:57:38.036Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/c9/d6514a412a1160aa65db539836b3d47f9b59f6675f294ec34ae32f867c82/cymem-2.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a84ba3178d9128b9ffb52ce81ebab456e9fe959125b51109f5b73ebdfc6b60d6", size = 262421, upload-time = "2025-11-14T14:57:39.265Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/fe/3ee37d02ca4040f2fb22d34eb415198f955862b5dd47eee01df4c8f5454c/cymem-2.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:2ff1c41fd59b789579fdace78aa587c5fc091991fa59458c382b116fc36e30dc", size = 40176, upload-time = "2025-11-14T14:57:40.706Z" },
+    { url = "https://files.pythonhosted.org/packages/94/fb/1b681635bfd5f2274d0caa8f934b58435db6c091b97f5593738065ddb786/cymem-2.0.13-cp312-cp312-win_arm64.whl", hash = "sha256:6bbd701338df7bf408648191dff52472a9b334f71bcd31a21a41d83821050f67", size = 35959, upload-time = "2025-11-14T14:57:41.682Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/0f/95a4d1e3bebfdfa7829252369357cf9a764f67569328cd9221f21e2c952e/cymem-2.0.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:891fd9030293a8b652dc7fb9fdc79a910a6c76fc679cd775e6741b819ffea476", size = 43478, upload-time = "2025-11-14T14:57:42.682Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/a0/8fc929cc29ae466b7b4efc23ece99cbd3ea34992ccff319089c624d667fd/cymem-2.0.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:89c4889bd16513ce1644ccfe1e7c473ba7ca150f0621e66feac3a571bde09e7e", size = 42695, upload-time = "2025-11-14T14:57:43.741Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/b3/deeb01354ebaf384438083ffe0310209ef903db3e7ba5a8f584b06d28387/cymem-2.0.13-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:45dcaba0f48bef9cc3d8b0b92058640244a95a9f12542210b51318da97c2cf28", size = 250573, upload-time = "2025-11-14T14:57:44.81Z" },
+    { url = "https://files.pythonhosted.org/packages/36/36/bc980b9a14409f3356309c45a8d88d58797d02002a9d794dd6c84e809d3a/cymem-2.0.13-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e96848faaafccc0abd631f1c5fb194eac0caee4f5a8777fdbb3e349d3a21741c", size = 254572, upload-time = "2025-11-14T14:57:46.023Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/dd/a12522952624685bd0f8968e26d2ed6d059c967413ce6eb52292f538f1b0/cymem-2.0.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e02d3e2c3bfeb21185d5a4a70790d9df40629a87d8d7617dc22b4e864f665fa3", size = 248060, upload-time = "2025-11-14T14:57:47.605Z" },
+    { url = "https://files.pythonhosted.org/packages/08/11/5dc933ddfeb2dfea747a0b935cb965b9a7580b324d96fc5f5a1b5ff8df29/cymem-2.0.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fece5229fd5ecdcd7a0738affb8c59890e13073ae5626544e13825f26c019d3c", size = 254601, upload-time = "2025-11-14T14:57:48.861Z" },
+    { url = "https://files.pythonhosted.org/packages/70/66/d23b06166864fa94e13a98e5922986ce774832936473578febce64448d75/cymem-2.0.13-cp313-cp313-win_amd64.whl", hash = "sha256:38aefeb269597c1a0c2ddf1567dd8605489b661fa0369c6406c1acd433b4c7ba", size = 40103, upload-time = "2025-11-14T14:57:50.396Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/9e/c7b21271ab88a21760f3afdec84d2bc09ffa9e6c8d774ad9d4f1afab0416/cymem-2.0.13-cp313-cp313-win_arm64.whl", hash = "sha256:717270dcfd8c8096b479c42708b151002ff98e434a7b6f1f916387a6c791e2ad", size = 36016, upload-time = "2025-11-14T14:57:51.611Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/28/d3b03427edc04ae04910edf1c24b993881c3ba93a9729a42bcbb816a1808/cymem-2.0.13-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7e1a863a7f144ffb345397813701509cfc74fc9ed360a4d92799805b4b865dd1", size = 46429, upload-time = "2025-11-14T14:57:52.582Z" },
+    { url = "https://files.pythonhosted.org/packages/35/a9/7ed53e481f47ebfb922b0b42e980cec83e98ccb2137dc597ea156642440c/cymem-2.0.13-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c16cb80efc017b054f78998c6b4b013cef509c7b3d802707ce1f85a1d68361bf", size = 46205, upload-time = "2025-11-14T14:57:53.64Z" },
+    { url = "https://files.pythonhosted.org/packages/61/39/a3d6ad073cf7f0fbbb8bbf09698c3c8fac11be3f791d710239a4e8dd3438/cymem-2.0.13-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0d78a27c88b26c89bd1ece247d1d5939dba05a1dae6305aad8fd8056b17ddb51", size = 296083, upload-time = "2025-11-14T14:57:55.922Z" },
+    { url = "https://files.pythonhosted.org/packages/36/0c/20697c8bc19f624a595833e566f37d7bcb9167b0ce69de896eba7cfc9c2d/cymem-2.0.13-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6d36710760f817194dacb09d9fc45cb6a5062ed75e85f0ef7ad7aeeb13d80cc3", size = 286159, upload-time = "2025-11-14T14:57:57.106Z" },
+    { url = "https://files.pythonhosted.org/packages/82/d4/9326e3422d1c2d2b4a8fb859bdcce80138f6ab721ddafa4cba328a505c71/cymem-2.0.13-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c8f30971cadd5dcf73bcfbbc5849b1f1e1f40db8cd846c4aa7d3b5e035c7b583", size = 288186, upload-time = "2025-11-14T14:57:58.334Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/bc/68da7dd749b72884dc22e898562f335002d70306069d496376e5ff3b6153/cymem-2.0.13-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9d441d0e45798ec1fd330373bf7ffa6b795f229275f64016b6a193e6e2a51522", size = 290353, upload-time = "2025-11-14T14:58:00.562Z" },
+    { url = "https://files.pythonhosted.org/packages/50/23/dbf2ad6ecd19b99b3aab6203b1a06608bbd04a09c522d836b854f2f30f73/cymem-2.0.13-cp313-cp313t-win_amd64.whl", hash = "sha256:d1c950eebb9f0f15e3ef3591313482a5a611d16fc12d545e2018cd607f40f472", size = 44764, upload-time = "2025-11-14T14:58:01.793Z" },
+    { url = "https://files.pythonhosted.org/packages/54/3f/35701c13e1fc7b0895198c8b20068c569a841e0daf8e0b14d1dc0816b28f/cymem-2.0.13-cp313-cp313t-win_arm64.whl", hash = "sha256:042e8611ef862c34a97b13241f5d0da86d58aca3cecc45c533496678e75c5a1f", size = 38964, upload-time = "2025-11-14T14:58:02.87Z" },
 ]
 
 [[package]]
@@ -1206,10 +1199,10 @@ name = "data-designer"
 version = "0.5.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "data-designer-config", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "data-designer-engine", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "prompt-toolkit", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "typer", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "data-designer-config", marker = "python_full_version >= '3.11'" },
+    { name = "data-designer-engine", marker = "python_full_version >= '3.11'" },
+    { name = "prompt-toolkit", marker = "python_full_version >= '3.11'" },
+    { name = "typer", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/85/71/a290f6dee45d54ffd0492ce6fcf12a7a0cb9ab56afd105437ca73aa88674/data_designer-0.5.5.tar.gz", hash = "sha256:ebb3c20229434e58e2911dd09c36d6bc2c66a847bf4dd03be37d06d6074dec69", size = 117433, upload-time = "2026-04-02T16:29:26.664Z" }
 wheels = [
@@ -1221,18 +1214,18 @@ name = "data-designer-config"
 version = "0.5.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "jinja2", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'linux'" },
-    { name = "pandas", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pillow", version = "12.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pydantic", extra = ["email"], marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pygments", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "python-json-logger", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pyyaml", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "rich", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "pandas", marker = "python_full_version >= '3.11'" },
+    { name = "pillow", version = "12.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", extra = ["email"], marker = "python_full_version >= '3.11'" },
+    { name = "pygments", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "python-json-logger", marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "rich", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d0/85/0ec87e05b628d4f8558f5addd355e6c7be054d94b38d866d45f4efc9918e/data_designer_config-0.5.5.tar.gz", hash = "sha256:f8930c073c6fb10fbe8f344845af7d02f291345ba7833b36f93f72c1fa02c191", size = 124320, upload-time = "2026-04-02T16:29:19.006Z" }
 wheels = [
@@ -1244,26 +1237,26 @@ name = "data-designer-engine"
 version = "0.5.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyascii", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "chardet", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "data-designer-config", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "duckdb", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "faker", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "httpx", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "httpx-retries", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "huggingface-hub", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "json-repair", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jsonpath-rust-bindings", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jsonschema", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "lxml", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "marko", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "mcp", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "ruff", version = "0.15.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "scipy", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "sqlfluff", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "tiktoken", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "anyascii", marker = "python_full_version >= '3.11'" },
+    { name = "chardet", marker = "python_full_version >= '3.11'" },
+    { name = "data-designer-config", marker = "python_full_version >= '3.11'" },
+    { name = "duckdb", marker = "python_full_version >= '3.11'" },
+    { name = "faker", marker = "python_full_version >= '3.11'" },
+    { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "httpx-retries", marker = "python_full_version >= '3.11'" },
+    { name = "huggingface-hub", marker = "python_full_version >= '3.11'" },
+    { name = "json-repair", marker = "python_full_version >= '3.11'" },
+    { name = "jsonpath-rust-bindings", marker = "python_full_version >= '3.11'" },
+    { name = "jsonschema", marker = "python_full_version >= '3.11'" },
+    { name = "lxml", marker = "python_full_version >= '3.11'" },
+    { name = "marko", marker = "python_full_version >= '3.11'" },
+    { name = "mcp", marker = "python_full_version >= '3.11'" },
+    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "ruff", version = "0.15.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "scipy", marker = "python_full_version >= '3.11'" },
+    { name = "sqlfluff", marker = "python_full_version >= '3.11'" },
+    { name = "tiktoken", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/13/3e/5f9c7e42605f2ed3247c1365a0d872a26c597d83da0dac8d641648c6aa39/data_designer_engine-0.5.5.tar.gz", hash = "sha256:9889c71eaa039dbbc9d11b1ff93859db61fa878cee45f42aa08d30e0c4efdd37", size = 757125, upload-time = "2026-04-02T16:29:23.761Z" }
 wheels = [
@@ -1275,25 +1268,15 @@ name = "datasets"
 version = "4.0.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "dill", version = "0.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -1301,8 +1284,8 @@ dependencies = [
     { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "python_full_version >= '3.11'" },
     { name = "huggingface-hub", marker = "python_full_version >= '3.11'" },
     { name = "multiprocess", version = "0.70.16", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "packaging", marker = "python_full_version >= '3.11'" },
     { name = "pandas", marker = "python_full_version >= '3.11'" },
     { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -1371,10 +1354,10 @@ name = "diff-cover"
 version = "10.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "chardet", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jinja2", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pluggy", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pygments", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "chardet", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "pluggy", marker = "python_full_version >= '3.11'" },
+    { name = "pygments", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/99/b4/eee71d1e338bc1f9bd3539b46b70e303dac061324b759c9a80fa3c96d90d/diff_cover-10.2.0.tar.gz", hash = "sha256:61bf83025f10510c76ef6a5820680cf61b9b974e8f81de70c57ac926fa63872a", size = 102473, upload-time = "2026-01-09T01:59:07.605Z" }
 wheels = [
@@ -1386,25 +1369,15 @@ name = "dill"
 version = "0.3.8"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/17/4d/ac7ffa80c69ea1df30a8aa11b3578692a5118e7cd1aa157e3ef73b092d15/dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca", size = 184847, upload-time = "2024-01-27T23:42:16.145Z" }
 wheels = [
@@ -1557,13 +1530,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bd/21/d903cc63a5140c822b7b62b373a87dc557e60c29b321dfb435061c5e67cf/duckdb-1.5.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70631c847ca918ee710ec874241b00cf9d2e5be90762cbb2a0389f17823c08f7", size = 21429837, upload-time = "2026-04-13T11:29:41.135Z" },
     { url = "https://files.pythonhosted.org/packages/e3/0a/b770d1f60c70597302130d6247f418549b7094251a02348fbaf1c7e147ae/duckdb-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:52a21823f3fbb52f0f0e5425e20b07391ad882464b955879499b5ff0b45a376b", size = 13107699, upload-time = "2026-04-13T11:29:43.905Z" },
     { url = "https://files.pythonhosted.org/packages/d9/cf/e200fe431d700962d1a908d2ce89f53ccee1cc8db260174ae663ba09686b/duckdb-1.5.2-cp313-cp313-win_arm64.whl", hash = "sha256:411ad438bd4140f189a10e7f515781335962c5d18bd07837dc6d202e3985253d", size = 13927646, upload-time = "2026-04-13T11:29:46.598Z" },
-    { url = "https://files.pythonhosted.org/packages/83/a1/f6286c67726cc1ea60a6e3c0d9fbc66527dde24ae089a51bbe298b13ca78/duckdb-1.5.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:6b0fe75c148000f060aa1a27b293cacc0ea08cc1cad724fbf2143d56070a3785", size = 30078598, upload-time = "2026-04-13T11:29:49.828Z" },
-    { url = "https://files.pythonhosted.org/packages/de/6a/59febb02f21a4a5c6b0b0099ef7c965fdd5e61e4904cf813809bb792e35f/duckdb-1.5.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:35579b8e3a064b5eaf15b0eafc558056a13f79a0a62e34cc4baf57119daecfec", size = 15975120, upload-time = "2026-04-13T11:29:52.631Z" },
-    { url = "https://files.pythonhosted.org/packages/09/70/ce750854d37bb5a45cccbb2c3cb04df4af56aea8fc30a2499bb643b4a9c0/duckdb-1.5.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ea58ff5b0880593a280cf5511734b17711b32ee1f58b47d726e8600848358160", size = 14227762, upload-time = "2026-04-13T11:29:55.564Z" },
-    { url = "https://files.pythonhosted.org/packages/28/dc/ad45ac3c0b6c4687dc649e8f6cf01af1c8b0443932a39b2abb4ebcb3babd/duckdb-1.5.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef461bca07313412dc09961c4a4757a851f56b95ac01c58fac6007632b7b94f2", size = 19315668, upload-time = "2026-04-13T11:29:58.427Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/b1/1464f468d2e5813f5808de95df9d3113a645a5bfa2ffcaecbc542ddae272/duckdb-1.5.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be37680ddb380015cb37318e378c53511c45c4f0d8fac5599d22b7d092b9217a", size = 21434056, upload-time = "2026-04-13T11:30:01.238Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/32/6673607e024722473fa7aafdd29c0e3dd231dd528f6cd8b5797fbeeb229d/duckdb-1.5.2-cp314-cp314-win_amd64.whl", hash = "sha256:0b291786014df1133f8f18b9df4d004484613146e858d71a21791e0fcca16cf4", size = 13633667, upload-time = "2026-04-13T11:30:04.05Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/e3/9d34173ec068631faea3ea6e73050700729363e7e33306a9a3218e5cdc61/duckdb-1.5.2-cp314-cp314-win_arm64.whl", hash = "sha256:c9f3e0b71b8a50fccfb42794899285d9d318ce2503782b9dd54868e5ecd0ad31", size = 14402513, upload-time = "2026-04-13T11:30:06.609Z" },
 ]
 
 [[package]]
@@ -1571,8 +1537,8 @@ name = "email-validator"
 version = "2.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dnspython", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "idna", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "dnspython", marker = "python_full_version >= '3.11'" },
+    { name = "idna", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f5/22/900cb125c76b7aaa450ce02fd727f452243f2e91a61af068b40adba60ea9/email_validator-2.3.0.tar.gz", hash = "sha256:9fc05c37f2f6cf439ff414f8fc46d917929974a82244c20eb10231ba60c54426", size = 51238, upload-time = "2025-08-26T13:09:06.831Z" }
 wheels = [
@@ -1611,7 +1577,7 @@ name = "faker"
 version = "20.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "python-dateutil", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "python-dateutil", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/65/24/54413530c8924a6a1bf75c3670f0911707fc7435017b57b4f57be098e469/Faker-20.1.0.tar.gz", hash = "sha256:562a3a09c3ed3a1a7b20e13d79f904dfdfc5e740f72813ecf95e4cf71e5a2f52", size = 1701717, upload-time = "2023-11-20T18:09:20.272Z" }
 wheels = [
@@ -1657,6 +1623,23 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" },
 ]
 
+[[package]]
+name = "flask"
+version = "3.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blinker" },
+    { name = "click" },
+    { name = "itsdangerous" },
+    { name = "jinja2" },
+    { name = "markupsafe" },
+    { name = "werkzeug" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/00/35d85dcce6c57fdc871f3867d465d780f302a175ea360f62533f12b27e2b/flask-3.1.3.tar.gz", hash = "sha256:0ef0e52b8a9cd932855379197dd8f94047b359ca0a78695144304cb45f87c9eb", size = 759004, upload-time = "2026-02-19T05:00:57.678Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/9c/34f6962f9b9e9c71f6e5ed806e0d0ff03c9d1b0b2340088a0cf4bce09b18/flask-3.1.3-py3-none-any.whl", hash = "sha256:f4bcbefc124291925f1a26446da31a5178f9483862233b23c0c96a20701f670c", size = 103424, upload-time = "2026-02-19T05:00:56.027Z" },
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.8.0"
@@ -1743,38 +1726,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492, upload-time = "2025-10-06T05:37:04.915Z" },
     { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034, upload-time = "2025-10-06T05:37:06.343Z" },
     { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749, upload-time = "2025-10-06T05:37:07.431Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/c8/85da824b7e7b9b6e7f7705b2ecaf9591ba6f79c1177f324c2735e41d36a2/frozenlist-1.8.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0", size = 86127, upload-time = "2025-10-06T05:37:08.438Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/e8/a1185e236ec66c20afd72399522f142c3724c785789255202d27ae992818/frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f", size = 49698, upload-time = "2025-10-06T05:37:09.48Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/93/72b1736d68f03fda5fdf0f2180fb6caaae3894f1b854d006ac61ecc727ee/frozenlist-1.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c", size = 49749, upload-time = "2025-10-06T05:37:10.569Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/b2/fabede9fafd976b991e9f1b9c8c873ed86f202889b864756f240ce6dd855/frozenlist-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2", size = 231298, upload-time = "2025-10-06T05:37:11.993Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/3b/d9b1e0b0eed36e70477ffb8360c49c85c8ca8ef9700a4e6711f39a6e8b45/frozenlist-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8", size = 232015, upload-time = "2025-10-06T05:37:13.194Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/94/be719d2766c1138148564a3960fc2c06eb688da592bdc25adcf856101be7/frozenlist-1.8.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686", size = 225038, upload-time = "2025-10-06T05:37:14.577Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/09/6712b6c5465f083f52f50cf74167b92d4ea2f50e46a9eea0523d658454ae/frozenlist-1.8.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e", size = 240130, upload-time = "2025-10-06T05:37:15.781Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/d4/cd065cdcf21550b54f3ce6a22e143ac9e4836ca42a0de1022da8498eac89/frozenlist-1.8.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a", size = 242845, upload-time = "2025-10-06T05:37:17.037Z" },
-    { url = "https://files.pythonhosted.org/packages/62/c3/f57a5c8c70cd1ead3d5d5f776f89d33110b1addae0ab010ad774d9a44fb9/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128", size = 229131, upload-time = "2025-10-06T05:37:18.221Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/52/232476fe9cb64f0742f3fde2b7d26c1dac18b6d62071c74d4ded55e0ef94/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f", size = 240542, upload-time = "2025-10-06T05:37:19.771Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/85/07bf3f5d0fb5414aee5f47d33c6f5c77bfe49aac680bfece33d4fdf6a246/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7", size = 237308, upload-time = "2025-10-06T05:37:20.969Z" },
-    { url = "https://files.pythonhosted.org/packages/11/99/ae3a33d5befd41ac0ca2cc7fd3aa707c9c324de2e89db0e0f45db9a64c26/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30", size = 238210, upload-time = "2025-10-06T05:37:22.252Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/60/b1d2da22f4970e7a155f0adde9b1435712ece01b3cd45ba63702aea33938/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7", size = 231972, upload-time = "2025-10-06T05:37:23.5Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/ab/945b2f32de889993b9c9133216c068b7fcf257d8595a0ac420ac8677cab0/frozenlist-1.8.0-cp314-cp314-win32.whl", hash = "sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806", size = 40536, upload-time = "2025-10-06T05:37:25.581Z" },
-    { url = "https://files.pythonhosted.org/packages/59/ad/9caa9b9c836d9ad6f067157a531ac48b7d36499f5036d4141ce78c230b1b/frozenlist-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0", size = 44330, upload-time = "2025-10-06T05:37:26.928Z" },
-    { url = "https://files.pythonhosted.org/packages/82/13/e6950121764f2676f43534c555249f57030150260aee9dcf7d64efda11dd/frozenlist-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b", size = 40627, upload-time = "2025-10-06T05:37:28.075Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/c7/43200656ecc4e02d3f8bc248df68256cd9572b3f0017f0a0c4e93440ae23/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d", size = 89238, upload-time = "2025-10-06T05:37:29.373Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/29/55c5f0689b9c0fb765055629f472c0de484dcaf0acee2f7707266ae3583c/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed", size = 50738, upload-time = "2025-10-06T05:37:30.792Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/7d/b7282a445956506fa11da8c2db7d276adcbf2b17d8bb8407a47685263f90/frozenlist-1.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930", size = 51739, upload-time = "2025-10-06T05:37:32.127Z" },
-    { url = "https://files.pythonhosted.org/packages/62/1c/3d8622e60d0b767a5510d1d3cf21065b9db874696a51ea6d7a43180a259c/frozenlist-1.8.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c", size = 284186, upload-time = "2025-10-06T05:37:33.21Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/14/aa36d5f85a89679a85a1d44cd7a6657e0b1c75f61e7cad987b203d2daca8/frozenlist-1.8.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24", size = 292196, upload-time = "2025-10-06T05:37:36.107Z" },
-    { url = "https://files.pythonhosted.org/packages/05/23/6bde59eb55abd407d34f77d39a5126fb7b4f109a3f611d3929f14b700c66/frozenlist-1.8.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37", size = 273830, upload-time = "2025-10-06T05:37:37.663Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/3f/22cff331bfad7a8afa616289000ba793347fcd7bc275f3b28ecea2a27909/frozenlist-1.8.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a", size = 294289, upload-time = "2025-10-06T05:37:39.261Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/89/5b057c799de4838b6c69aa82b79705f2027615e01be996d2486a69ca99c4/frozenlist-1.8.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2", size = 300318, upload-time = "2025-10-06T05:37:43.213Z" },
-    { url = "https://files.pythonhosted.org/packages/30/de/2c22ab3eb2a8af6d69dc799e48455813bab3690c760de58e1bf43b36da3e/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef", size = 282814, upload-time = "2025-10-06T05:37:45.337Z" },
-    { url = "https://files.pythonhosted.org/packages/59/f7/970141a6a8dbd7f556d94977858cfb36fa9b66e0892c6dd780d2219d8cd8/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe", size = 291762, upload-time = "2025-10-06T05:37:46.657Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/15/ca1adae83a719f82df9116d66f5bb28bb95557b3951903d39135620ef157/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8", size = 289470, upload-time = "2025-10-06T05:37:47.946Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/83/dca6dc53bf657d371fbc88ddeb21b79891e747189c5de990b9dfff2ccba1/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a", size = 289042, upload-time = "2025-10-06T05:37:49.499Z" },
-    { url = "https://files.pythonhosted.org/packages/96/52/abddd34ca99be142f354398700536c5bd315880ed0a213812bc491cff5e4/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e", size = 283148, upload-time = "2025-10-06T05:37:50.745Z" },
-    { url = "https://files.pythonhosted.org/packages/af/d3/76bd4ed4317e7119c2b7f57c3f6934aba26d277acc6309f873341640e21f/frozenlist-1.8.0-cp314-cp314t-win32.whl", hash = "sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df", size = 44676, upload-time = "2025-10-06T05:37:52.222Z" },
-    { url = "https://files.pythonhosted.org/packages/89/76/c615883b7b521ead2944bb3480398cbb07e12b7b4e4d073d3752eb721558/frozenlist-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd", size = 49451, upload-time = "2025-10-06T05:37:53.425Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/a3/5982da14e113d07b325230f95060e2169f5311b1017ea8af2a29b374c289/frozenlist-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79", size = 42507, upload-time = "2025-10-06T05:37:54.513Z" },
     { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" },
 ]
 
@@ -1783,25 +1734,15 @@ name = "fsspec"
 version = "2025.3.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/34/f4/5721faf47b8c499e776bc34c6a8fc17efdf7fdef0b00f398128bc5dcb4ac/fsspec-2025.3.0.tar.gz", hash = "sha256:a935fd1ea872591f2b5148907d103488fc523295e6c64b835cfad8c3eca44972", size = 298491, upload-time = "2025-03-07T21:47:56.461Z" }
 wheels = [
@@ -1836,25 +1777,15 @@ name = "gcsfs"
 version = "2025.3.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "aiohttp", marker = "python_full_version >= '3.11'" },
@@ -1906,14 +1837,14 @@ wheels = [
 
 [[package]]
 name = "gitpython"
-version = "3.1.45"
+version = "3.1.50"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "gitdb" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9a/c8/dd58967d119baab745caec2f9d853297cec1989ec1d63f677d3880632b88/gitpython-3.1.45.tar.gz", hash = "sha256:85b0ee964ceddf211c41b9f27a49086010a190fd8132a24e21f362a4b36a791c", size = 215076, upload-time = "2025-07-24T03:45:54.871Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/33/f6/354ae6491228b5eb40e10d89c4d13c651fe1cf7556e35ebdded50cff57ce/gitpython-3.1.50.tar.gz", hash = "sha256:80da2d12504d52e1f998772dc5baf6e553f8d2fcfe1fcc226c9d9a2ee3372dcc", size = 219798, upload-time = "2026-05-06T04:01:26.571Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
+    { url = "https://files.pythonhosted.org/packages/20/7a/1c6e3562dfd8950adbb11ffbc65d21e7c89d01a6e4f137fa981056de25c5/gitpython-3.1.50-py3-none-any.whl", hash = "sha256:d352abe2908d07355014abdd21ddf798c2a961469239afec4962e9da884858f9", size = 212507, upload-time = "2026-05-06T04:01:23.799Z" },
 ]
 
 [[package]]
@@ -1933,6 +1864,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ed/d4/90197b416cb61cefd316964fd9e7bd8324bcbafabf40eef14a9f20b81974/google_api_core-2.28.1-py3-none-any.whl", hash = "sha256:4021b0f8ceb77a6fb4de6fde4502cecab45062e66ff4f2895169e0b35bc9466c", size = 173706, upload-time = "2025-10-28T21:34:50.151Z" },
 ]
 
+[package.optional-dependencies]
+grpc = [
+    { name = "grpcio", marker = "python_full_version >= '3.11'" },
+    { name = "grpcio-status", marker = "python_full_version >= '3.11'" },
+]
+
 [[package]]
 name = "google-auth"
 version = "2.43.0"
@@ -1991,6 +1928,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ae/ef/3b57bf617ee0c79450c1ff211d1eb888db8fc1050ac74b3e52cc6ed86e63/google_cloud_storage-3.6.0-py3-none-any.whl", hash = "sha256:5decbdddd63b7d1fc3e266a393ad6453d2e27d172bd982b1e2f15481668db097", size = 299039, upload-time = "2025-11-17T10:18:27.66Z" },
 ]
 
+[[package]]
+name = "google-cloud-translate"
+version = "3.26.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-api-core", extra = ["grpc"], marker = "python_full_version >= '3.11'" },
+    { name = "google-auth", marker = "python_full_version >= '3.11'" },
+    { name = "google-cloud-core", marker = "python_full_version >= '3.11'" },
+    { name = "grpc-google-iam-v1", marker = "python_full_version >= '3.11'" },
+    { name = "grpcio", marker = "python_full_version >= '3.11'" },
+    { name = "proto-plus", marker = "python_full_version >= '3.11'" },
+    { name = "protobuf", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/45/52/3bc1f9b87511eb3b4d28c4d7da65f1f4ecc407df46e56cff0eae43b5d875/google_cloud_translate-3.26.0.tar.gz", hash = "sha256:74c4c302ea705daa1d7dda095288fd736bb616f943480db34314cd0cd52cc9dd", size = 276633, upload-time = "2026-04-10T00:41:26.769Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/17/c2/50ed19071e57002ee1bb328e3fcfe43d71aafd9ab0b1e4a107d6c4d3c79d/google_cloud_translate-3.26.0-py3-none-any.whl", hash = "sha256:5b9f4d2cfdc41fcd357cda47d4d880acb6d720be7f0b8fdf95f2816dc982359d", size = 210892, upload-time = "2026-04-10T00:41:12.339Z" },
+]
+
 [[package]]
 name = "google-crc32c"
 version = "1.7.1"
@@ -2050,6 +2005,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" },
 ]
 
+[package.optional-dependencies]
+grpc = [
+    { name = "grpcio", marker = "python_full_version >= '3.11'" },
+]
+
 [[package]]
 name = "graphviz"
 version = "0.21"
@@ -2059,6 +2019,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42", size = 47300, upload-time = "2025-06-15T09:35:04.433Z" },
 ]
 
+[[package]]
+name = "grpc-google-iam-v1"
+version = "0.14.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos", extra = ["grpc"], marker = "python_full_version >= '3.11'" },
+    { name = "grpcio", marker = "python_full_version >= '3.11'" },
+    { name = "protobuf", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/44/4f/d098419ad0bfc06c9ce440575f05aa22d8973b6c276e86ac7890093d3c37/grpc_google_iam_v1-0.14.4.tar.gz", hash = "sha256:392b3796947ed6334e61171d9ab06bf7eb357f554e5fc7556ad7aab6d0e17038", size = 23706, upload-time = "2026-04-01T01:57:49.813Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/89/22/c2dd50c09bf679bd38173656cd4402d2511e563b33bc88f90009cf50613c/grpc_google_iam_v1-0.14.4-py3-none-any.whl", hash = "sha256:412facc320fcbd94034b4df3d557662051d4d8adfa86e0ddb4dca70a3f739964", size = 32675, upload-time = "2026-04-01T01:57:47.69Z" },
+]
+
 [[package]]
 name = "grpcio"
 version = "1.76.0"
@@ -2108,16 +2082,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/74/fd3317be5672f4856bcdd1a9e7b5e17554692d3db9a3b273879dc02d657d/grpcio-1.76.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:931091142fd8cc14edccc0845a79248bc155425eee9a98b2db2ea4f00a235a42", size = 7589983, upload-time = "2025-10-21T16:22:07.881Z" },
     { url = "https://files.pythonhosted.org/packages/45/bb/ca038cf420f405971f19821c8c15bcbc875505f6ffadafe9ffd77871dc4c/grpcio-1.76.0-cp313-cp313-win32.whl", hash = "sha256:5e8571632780e08526f118f74170ad8d50fb0a48c23a746bef2a6ebade3abd6f", size = 3984727, upload-time = "2025-10-21T16:22:10.032Z" },
     { url = "https://files.pythonhosted.org/packages/41/80/84087dc56437ced7cdd4b13d7875e7439a52a261e3ab4e06488ba6173b0a/grpcio-1.76.0-cp313-cp313-win_amd64.whl", hash = "sha256:f9f7bd5faab55f47231ad8dba7787866b69f5e93bc306e3915606779bbfb4ba8", size = 4702799, upload-time = "2025-10-21T16:22:12.709Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/46/39adac80de49d678e6e073b70204091e76631e03e94928b9ea4ecf0f6e0e/grpcio-1.76.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:ff8a59ea85a1f2191a0ffcc61298c571bc566332f82e5f5be1b83c9d8e668a62", size = 5808417, upload-time = "2025-10-21T16:22:15.02Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/f5/a4531f7fb8b4e2a60b94e39d5d924469b7a6988176b3422487be61fe2998/grpcio-1.76.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06c3d6b076e7b593905d04fdba6a0525711b3466f43b3400266f04ff735de0cd", size = 11828219, upload-time = "2025-10-21T16:22:17.954Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/1c/de55d868ed7a8bd6acc6b1d6ddc4aa36d07a9f31d33c912c804adb1b971b/grpcio-1.76.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fd5ef5932f6475c436c4a55e4336ebbe47bd3272be04964a03d316bbf4afbcbc", size = 6367826, upload-time = "2025-10-21T16:22:20.721Z" },
-    { url = "https://files.pythonhosted.org/packages/59/64/99e44c02b5adb0ad13ab3adc89cb33cb54bfa90c74770f2607eea629b86f/grpcio-1.76.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:b331680e46239e090f5b3cead313cc772f6caa7d0fc8de349337563125361a4a", size = 7049550, upload-time = "2025-10-21T16:22:23.637Z" },
-    { url = "https://files.pythonhosted.org/packages/43/28/40a5be3f9a86949b83e7d6a2ad6011d993cbe9b6bd27bea881f61c7788b6/grpcio-1.76.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2229ae655ec4e8999599469559e97630185fdd53ae1e8997d147b7c9b2b72cba", size = 6575564, upload-time = "2025-10-21T16:22:26.016Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/a9/1be18e6055b64467440208a8559afac243c66a8b904213af6f392dc2212f/grpcio-1.76.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:490fa6d203992c47c7b9e4a9d39003a0c2bcc1c9aa3c058730884bbbb0ee9f09", size = 7176236, upload-time = "2025-10-21T16:22:28.362Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/55/dba05d3fcc151ce6e81327541d2cc8394f442f6b350fead67401661bf041/grpcio-1.76.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:479496325ce554792dba6548fae3df31a72cef7bad71ca2e12b0e58f9b336bfc", size = 8125795, upload-time = "2025-10-21T16:22:31.075Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/45/122df922d05655f63930cf42c9e3f72ba20aadb26c100ee105cad4ce4257/grpcio-1.76.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1c9b93f79f48b03ada57ea24725d83a30284a012ec27eab2cf7e50a550cbbbcc", size = 7592214, upload-time = "2025-10-21T16:22:33.831Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/6e/0b899b7f6b66e5af39e377055fb4a6675c9ee28431df5708139df2e93233/grpcio-1.76.0-cp314-cp314-win32.whl", hash = "sha256:747fa73efa9b8b1488a95d0ba1039c8e2dca0f741612d80415b1e1c560febf4e", size = 4062961, upload-time = "2025-10-21T16:22:36.468Z" },
-    { url = "https://files.pythonhosted.org/packages/19/41/0b430b01a2eb38ee887f88c1f07644a1df8e289353b78e82b37ef988fb64/grpcio-1.76.0-cp314-cp314-win_amd64.whl", hash = "sha256:922fa70ba549fce362d2e2871ab542082d66e2aaf0c19480ea453905b01f384e", size = 4834462, upload-time = "2025-10-21T16:22:39.772Z" },
+]
+
+[[package]]
+name = "grpcio-status"
+version = "1.76.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos", marker = "python_full_version >= '3.11'" },
+    { name = "grpcio", marker = "python_full_version >= '3.11'" },
+    { name = "protobuf", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3f/46/e9f19d5be65e8423f886813a2a9d0056ba94757b0c5007aa59aed1a961fa/grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd", size = 13679, upload-time = "2025-10-21T16:28:52.545Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8c/cc/27ba60ad5a5f2067963e6a858743500df408eb5855e98be778eaef8c9b02/grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18", size = 14425, upload-time = "2025-10-21T16:28:40.853Z" },
 ]
 
 [[package]]
@@ -2159,13 +2137,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/1d/04513e3cab8f29ab8c109d309ddd21a2705afab9d52f2ba1151e0c14f086/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6de1fc44f58f6dd937956c8d304d8c2dea264c80680bcfa61ca4a15e7b76780f", size = 3408448, upload-time = "2025-10-24T19:04:20.951Z" },
     { url = "https://files.pythonhosted.org/packages/f0/7c/60a2756d7feec7387db3a1176c632357632fbe7849fce576c5559d4520c7/hf_xet-1.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f182f264ed2acd566c514e45da9f2119110e48a87a327ca271027904c70c5832", size = 3503401, upload-time = "2025-10-24T19:04:22.549Z" },
     { url = "https://files.pythonhosted.org/packages/4e/64/48fffbd67fb418ab07451e4ce641a70de1c40c10a13e25325e24858ebe5a/hf_xet-1.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:293a7a3787e5c95d7be1857358a9130694a9c6021de3f27fa233f37267174382", size = 2900866, upload-time = "2025-10-24T19:04:33.461Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/51/f7e2caae42f80af886db414d4e9885fac959330509089f97cccb339c6b87/hf_xet-1.2.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:10bfab528b968c70e062607f663e21e34e2bba349e8038db546646875495179e", size = 2861861, upload-time = "2025-10-24T19:04:19.01Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/1d/a641a88b69994f9371bd347f1dd35e5d1e2e2460a2e350c8d5165fc62005/hf_xet-1.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a212e842647b02eb6a911187dc878e79c4aa0aa397e88dd3b26761676e8c1f8", size = 2717699, upload-time = "2025-10-24T19:04:17.306Z" },
-    { url = "https://files.pythonhosted.org/packages/df/e0/e5e9bba7d15f0318955f7ec3f4af13f92e773fbb368c0b8008a5acbcb12f/hf_xet-1.2.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e06daccb3a7d4c065f34fc26c14c74f4653069bb2b194e7f18f17cbe9939c0", size = 3314885, upload-time = "2025-10-24T19:04:07.642Z" },
-    { url = "https://files.pythonhosted.org/packages/21/90/b7fe5ff6f2b7b8cbdf1bd56145f863c90a5807d9758a549bf3d916aa4dec/hf_xet-1.2.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:29c8fc913a529ec0a91867ce3d119ac1aac966e098cf49501800c870328cc090", size = 3221550, upload-time = "2025-10-24T19:04:05.55Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/cb/73f276f0a7ce46cc6a6ec7d6c7d61cbfe5f2e107123d9bbd0193c355f106/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e159cbfcfbb29f920db2c09ed8b660eb894640d284f102ada929b6e3dc410a", size = 3408010, upload-time = "2025-10-24T19:04:28.598Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/1e/d642a12caa78171f4be64f7cd9c40e3ca5279d055d0873188a58c0f5fbb9/hf_xet-1.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c91d5ae931510107f148874e9e2de8a16052b6f1b3ca3c1b12f15ccb491390f", size = 3503264, upload-time = "2025-10-24T19:04:30.397Z" },
-    { url = "https://files.pythonhosted.org/packages/17/b5/33764714923fa1ff922770f7ed18c2daae034d21ae6e10dbf4347c854154/hf_xet-1.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:210d577732b519ac6ede149d2f2f34049d44e8622bf14eb3d63bbcd2d4b332dc", size = 2901071, upload-time = "2025-10-24T19:04:37.463Z" },
     { url = "https://files.pythonhosted.org/packages/96/2d/22338486473df5923a9ab7107d375dbef9173c338ebef5098ef593d2b560/hf_xet-1.2.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:46740d4ac024a7ca9b22bebf77460ff43332868b661186a8e46c227fdae01848", size = 2866099, upload-time = "2025-10-24T19:04:15.366Z" },
     { url = "https://files.pythonhosted.org/packages/7f/8c/c5becfa53234299bc2210ba314eaaae36c2875e0045809b82e40a9544f0c/hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:27df617a076420d8845bea087f59303da8be17ed7ec0cd7ee3b9b9f579dff0e4", size = 2722178, upload-time = "2025-10-24T19:04:13.695Z" },
     { url = "https://files.pythonhosted.org/packages/9a/92/cf3ab0b652b082e66876d08da57fcc6fa2f0e6c70dfbbafbd470bb73eb47/hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3651fd5bfe0281951b988c0facbe726aa5e347b103a675f49a3fa8144c7968fd", size = 3320214, upload-time = "2025-10-24T19:04:03.596Z" },
@@ -2180,25 +2151,15 @@ name = "hf-xet"
 version = "1.4.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/53/92/ec9ad04d0b5728dca387a45af7bc98fbb0d73b2118759f5f6038b61a57e8/hf_xet-1.4.3.tar.gz", hash = "sha256:8ddedb73c8c08928c793df2f3401ec26f95be7f7e516a7bee2fbb546f6676113", size = 670477, upload-time = "2026-03-31T22:40:07.874Z" }
 wheels = [
@@ -2210,14 +2171,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1b/c4/39d6e136cbeea9ca5a23aad4b33024319222adbdc059ebcda5fc7d9d5ff4/hf_xet-1.4.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2815a49a7a59f3e2edf0cf113ae88e8cb2ca2a221bf353fb60c609584f4884d4", size = 4424525, upload-time = "2026-03-31T22:40:00.225Z" },
     { url = "https://files.pythonhosted.org/packages/46/f2/adc32dae6bdbc367853118b9878139ac869419a4ae7ba07185dc31251b76/hf_xet-1.4.3-cp313-cp313t-win_amd64.whl", hash = "sha256:42ee323265f1e6a81b0e11094564fb7f7e0ec75b5105ffd91ae63f403a11931b", size = 3671610, upload-time = "2026-03-31T22:40:10.42Z" },
     { url = "https://files.pythonhosted.org/packages/e2/19/25d897dcc3f81953e0c2cde9ec186c7a0fee413eb0c9a7a9130d87d94d3a/hf_xet-1.4.3-cp313-cp313t-win_arm64.whl", hash = "sha256:27c976ba60079fb8217f485b9c5c7fcd21c90b0367753805f87cb9f3cdc4418a", size = 3528529, upload-time = "2026-03-31T22:40:09.106Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/36/3e8f85ca9fe09b8de2b2e10c63b3b3353d7dda88a0b3d426dffbe7b8313b/hf_xet-1.4.3-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:5251d5ece3a81815bae9abab41cf7ddb7bcb8f56411bce0827f4a3071c92fdc6", size = 3801019, upload-time = "2026-03-31T22:39:56.651Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/9c/defb6cb1de28bccb7bd8d95f6e60f72a3d3fa4cb3d0329c26fb9a488bfe7/hf_xet-1.4.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1feb0f3abeacee143367c326a128a2e2b60868ec12a36c225afb1d6c5a05e6d2", size = 3558746, upload-time = "2026-03-31T22:39:54.766Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/bd/8d001191893178ff8e826e46ad5299446e62b93cd164e17b0ffea08832ec/hf_xet-1.4.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b301fc150290ca90b4fccd079829b84bb4786747584ae08b94b4577d82fb791", size = 4207692, upload-time = "2026-03-31T22:39:46.246Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/48/6790b402803250e9936435613d3a78b9aaeee7973439f0918848dde58309/hf_xet-1.4.3-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:d972fbe95ddc0d3c0fc49b31a8a69f47db35c1e3699bf316421705741aab6653", size = 3986281, upload-time = "2026-03-31T22:39:44.648Z" },
-    { url = "https://files.pythonhosted.org/packages/51/56/ea62552fe53db652a9099eda600b032d75554d0e86c12a73824bfedef88b/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c5b48db1ee344a805a1b9bd2cda9b6b65fe77ed3787bd6e87ad5521141d317cd", size = 4187414, upload-time = "2026-03-31T22:40:04.951Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/f5/bc1456d4638061bea997e6d2db60a1a613d7b200e0755965ec312dc1ef79/hf_xet-1.4.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:22bdc1f5fb8b15bf2831440b91d1c9bbceeb7e10c81a12e8d75889996a5c9da8", size = 4424368, upload-time = "2026-03-31T22:40:06.347Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/76/ab597bae87e1f06d18d3ecb8ed7f0d3c9a37037fc32ce76233d369273c64/hf_xet-1.4.3-cp314-cp314t-win_amd64.whl", hash = "sha256:0392c79b7cf48418cd61478c1a925246cf10639f4cd9d94368d8ca1e8df9ea07", size = 3672280, upload-time = "2026-03-31T22:40:16.401Z" },
-    { url = "https://files.pythonhosted.org/packages/62/05/2e462d34e23a09a74d73785dbed71cc5dbad82a72eee2ad60a72a554155d/hf_xet-1.4.3-cp314-cp314t-win_arm64.whl", hash = "sha256:681c92a07796325778a79d76c67011764ecc9042a8c3579332b61b63ae512075", size = 3528945, upload-time = "2026-03-31T22:40:14.995Z" },
     { url = "https://files.pythonhosted.org/packages/ac/9f/9c23e4a447b8f83120798f9279d0297a4d1360bdbf59ef49ebec78fe2545/hf_xet-1.4.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:d0da85329eaf196e03e90b84c2d0aca53bd4573d097a75f99609e80775f98025", size = 3805048, upload-time = "2026-03-31T22:39:53.105Z" },
     { url = "https://files.pythonhosted.org/packages/0b/f8/7aacb8e5f4a7899d39c787b5984e912e6c18b11be136ef13947d7a66d265/hf_xet-1.4.3-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e23717ce4186b265f69afa66e6f0069fe7efbf331546f5c313d00e123dc84583", size = 3562178, upload-time = "2026-03-31T22:39:51.295Z" },
     { url = "https://files.pythonhosted.org/packages/df/9a/a24b26dc8a65f0ecc0fe5be981a19e61e7ca963b85e062c083f3a9100529/hf_xet-1.4.3-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc360b70c815bf340ed56c7b8c63aacf11762a4b099b2fe2c9bd6d6068668c08", size = 4212320, upload-time = "2026-03-31T22:39:42.922Z" },
@@ -2276,7 +2229,7 @@ name = "httpx-retries"
 version = "0.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "httpx", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fb/f5/046cac13877ce9b55aebdbb3999e0e45b19b989a95c5fd1040fa04bd1f92/httpx_retries-0.5.0.tar.gz", hash = "sha256:d8c8e1e0852d84be3837aba0bcf78aeb89a4b77db95e8cc988c8c058830b3044", size = 15647, upload-time = "2026-04-20T01:21:47.154Z" }
 wheels = [
@@ -2319,9 +2272,9 @@ name = "hydra-core"
 version = "1.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "antlr4-python3-runtime", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "omegaconf", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "packaging", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "antlr4-python3-runtime" },
+    { name = "omegaconf" },
+    { name = "packaging" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494, upload-time = "2023-02-23T18:33:43.03Z" }
 wheels = [
@@ -2421,6 +2374,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6b/c7/f6fd3db6c33a164631c39dce2ca26a3794e3abf91b875cc99a43a5565d88/iso639_lang-2.6.3-py3-none-any.whl", hash = "sha256:a6c2fb9f739dca180dc7f48b098880f303bcce2cdf93a4ca3152ed8bbbb94fbb", size = 324990, upload-time = "2025-07-23T09:04:52.221Z" },
 ]
 
+[[package]]
+name = "itsdangerous"
+version = "2.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" },
+]
+
 [[package]]
 name = "jieba"
 version = "0.42.1"
@@ -2505,33 +2467,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" },
     { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" },
     { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/1e/354ed92461b165bd581f9ef5150971a572c873ec3b68a916d5aa91da3cc2/jiter-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:6f396837fc7577871ca8c12edaf239ed9ccef3bbe39904ae9b8b63ce0a48b140", size = 315277, upload-time = "2026-04-10T14:27:18.109Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/95/8c7c7028aa8636ac21b7a55faef3e34215e6ed0cbf5ae58258427f621aa3/jiter-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a4d50ea3d8ba4176f79754333bd35f1bbcd28e91adc13eb9b7ca91bc52a6cef9", size = 315923, upload-time = "2026-04-10T14:27:19.603Z" },
-    { url = "https://files.pythonhosted.org/packages/47/40/e2a852a44c4a089f2681a16611b7ce113224a80fd8504c46d78491b47220/jiter-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce17f8a050447d1b4153bda4fb7d26e6a9e74eb4f4a41913f30934c5075bf615", size = 344943, upload-time = "2026-04-10T14:27:21.262Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/1f/670f92adee1e9895eac41e8a4d623b6da68c4d46249d8b556b60b63f949e/jiter-0.14.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f4f1c4b125e1652aefbc2e2c1617b60a160ab789d180e3d423c41439e5f32850", size = 369725, upload-time = "2026-04-10T14:27:22.766Z" },
-    { url = "https://files.pythonhosted.org/packages/01/2f/541c9ba567d05de1c4874a0f8f8c5e3fd78e2b874266623da9a775cf46e0/jiter-0.14.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be808176a6a3a14321d18c603f2d40741858a7c4fc982f83232842689fe86dd9", size = 461210, upload-time = "2026-04-10T14:27:24.315Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/a9/c31cbec09627e0d5de7aeaec7690dba03e090caa808fefd8133137cf45bc/jiter-0.14.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26679d58ba816f88c3849306dd58cb863a90a1cf352cdd4ef67e30ccf8a77994", size = 380002, upload-time = "2026-04-10T14:27:26.155Z" },
-    { url = "https://files.pythonhosted.org/packages/50/02/3c05c1666c41904a2f607475a73e7a4763d1cbde2d18229c4f85b22dc253/jiter-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80381f5a19af8fa9aef743f080e34f6b25ebd89656475f8cf0470ec6157052aa", size = 354678, upload-time = "2026-04-10T14:27:27.701Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/97/e15b33545c2b13518f560d695f974b9891b311641bdcf178d63177e8801e/jiter-0.14.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:004df5fdb8ecbd6d99f3227df18ba1a259254c4359736a2e6f036c944e02d7c5", size = 358920, upload-time = "2026-04-10T14:27:29.256Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/d2/8b1461def6b96ba44530df20d07ef7a1c7da22f3f9bf1727e2d611077bf1/jiter-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cff5708f7ed0fa098f2b53446c6fa74c48469118e5cd7497b4f1cd569ab06928", size = 394512, upload-time = "2026-04-10T14:27:31.344Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/88/837566dd6ed6e452e8d3205355afd484ce44b2533edfa4ed73a298ea893e/jiter-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:2492e5f06c36a976d25c7cc347a60e26d5470178d44cde1b9b75e60b4e519f28", size = 521120, upload-time = "2026-04-10T14:27:33.299Z" },
-    { url = "https://files.pythonhosted.org/packages/89/6b/b00b45c4d1b4c031777fe161d620b755b5b02cdade1e316dcb46e4471d63/jiter-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7609cfbe3a03d37bfdbf5052012d5a879e72b83168a363deae7b3a26564d57de", size = 553668, upload-time = "2026-04-10T14:27:34.868Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/d8/6fe5b42011d19397433d345716eac16728ac241862a2aac9c91923c7509a/jiter-0.14.0-cp314-cp314-win32.whl", hash = "sha256:7282342d32e357543565286b6450378c3cd402eea333fc1ebe146f1fabb306fc", size = 207001, upload-time = "2026-04-10T14:27:36.455Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/43/5c2e08da1efad5e410f0eaaabeadd954812612c33fbbd8fd5328b489139d/jiter-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:bd77945f38866a448e73b0b7637366afa814d4617790ecd88a18ca74377e6c02", size = 202187, upload-time = "2026-04-10T14:27:38Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/1f/6e39ac0b4cdfa23e606af5b245df5f9adaa76f35e0c5096790da430ca506/jiter-0.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:f2d4c61da0821ee42e0cdf5489da60a6d074306313a377c2b35af464955a3611", size = 192257, upload-time = "2026-04-10T14:27:39.504Z" },
-    { url = "https://files.pythonhosted.org/packages/05/57/7dbc0ffbbb5176a27e3518716608aa464aee2e2887dc938f0b900a120449/jiter-0.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1bf7ff85517dd2f20a5750081d2b75083c1b269cf75afc7511bdf1f9548beb3b", size = 323441, upload-time = "2026-04-10T14:27:41.039Z" },
-    { url = "https://files.pythonhosted.org/packages/83/6e/7b3314398d8983f06b557aa21b670511ec72d3b79a68ee5e4d9bff972286/jiter-0.14.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ef8791c3e78d6c6b157c6d360fbb5c715bebb8113bc6a9303c5caff012754a", size = 348109, upload-time = "2026-04-10T14:27:42.552Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/4f/8dc674bcd7db6dba566de73c08c763c337058baff1dbeb34567045b27cdc/jiter-0.14.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e74663b8b10da1fe0f4e4703fd7980d24ad17174b6bb35d8498d6e3ebce2ae6a", size = 368328, upload-time = "2026-04-10T14:27:44.574Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/5f/188e09a1f20906f98bbdec44ed820e19f4e8eb8aff88b9d1a5a497587ff3/jiter-0.14.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1aca29ba52913f78362ec9c2da62f22cdc4c3083313403f90c15460979b84d9b", size = 463301, upload-time = "2026-04-10T14:27:46.717Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/f0/19046ef965ed8f349e8554775bb12ff4352f443fbe12b95d31f575891256/jiter-0.14.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b39b7d87a952b79949af5fef44d2544e58c21a28da7f1bae3ef166455c61746", size = 378891, upload-time = "2026-04-10T14:27:48.32Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/c3/da43bd8431ee175695777ee78cf0e93eacbb47393ff493f18c45231b427d/jiter-0.14.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d918a68b26e9fab068c2b5453577ef04943ab2807b9a6275df2a812599a310", size = 360749, upload-time = "2026-04-10T14:27:49.88Z" },
-    { url = "https://files.pythonhosted.org/packages/72/26/e054771be889707c6161dbdec9c23d33a9ec70945395d70f07cfea1e9a6f/jiter-0.14.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:b08997c35aee1201c1a5361466a8fb9162d03ae7bf6568df70b6c859f1e654a4", size = 358526, upload-time = "2026-04-10T14:27:51.504Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/0f/7bea65ea2a6d91f2bf989ff11a18136644392bf2b0497a1fa50934c30a9c/jiter-0.14.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:260bf7ca20704d58d41f669e5e9fe7fe2fa72901a6b324e79056f5d52e9c9be2", size = 393926, upload-time = "2026-04-10T14:27:53.368Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/a1/b1ff7d70deef61ac0b7c6c2f12d2ace950cdeecb4fdc94500a0926802857/jiter-0.14.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:37826e3df29e60f30a382f9294348d0238ef127f4b5d7f5f8da78b5b9e050560", size = 521052, upload-time = "2026-04-10T14:27:55.058Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/7b/3b0649983cbaf15eda26a414b5b1982e910c67bd6f7b1b490f3cfc76896a/jiter-0.14.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:645be49c46f2900937ba0eaf871ad5183c96858c0af74b6becc7f4e367e36e06", size = 553716, upload-time = "2026-04-10T14:27:57.269Z" },
-    { url = "https://files.pythonhosted.org/packages/97/f8/33d78c83bd93ae0c0af05293a6660f88a1977caef39a6d72a84afab94ce0/jiter-0.14.0-cp314-cp314t-win32.whl", hash = "sha256:2f7877ed45118de283786178eceaf877110abacd04fde31efff3940ae9672674", size = 207957, upload-time = "2026-04-10T14:27:59.285Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/ac/2b760516c03e2227826d1f7025d89bf6bf6357a28fe75c2a2800873c50bf/jiter-0.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:14c0cb10337c49f5eafe8e7364daca5e29a020ea03580b8f8e6c597fed4e1588", size = 204690, upload-time = "2026-04-10T14:28:00.962Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" },
     { url = "https://files.pythonhosted.org/packages/32/a1/ef34ca2cab2962598591636a1804b93645821201cc0095d4a93a9a329c9d/jiter-0.14.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:a25ffa2dbbdf8721855612f6dca15c108224b12d0c4024d0ac3d7902132b4211", size = 311366, upload-time = "2026-04-10T14:28:27.943Z" },
     { url = "https://files.pythonhosted.org/packages/60/bb/520576a532a6b8a6f42747afed289c8448c879a34d7802fe2c832d4fd38f/jiter-0.14.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:0ac9cbaa86c10996b92bd12c91659b60f939f8e28fcfa6bc11a0e90a774ce95b", size = 309873, upload-time = "2026-04-10T14:28:29.688Z" },
     { url = "https://files.pythonhosted.org/packages/b2/7c/c16db114ea1f2f532f198aa8dc39585026af45af362c69a0492f31bc4821/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:844e73b6c56b505e9e169234ea3bdea2ea43f769f847f47ac559ba1d2361ebea", size = 344816, upload-time = "2026-04-10T14:28:31.348Z" },
@@ -2634,21 +2569,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/0c/ce/26b3c59d5a18ac6cfb1b59f4eac4d5e1b8fd0f4b0f376b2ec6f1a9f6f56c/jsonpath_rust_bindings-1.1.1-cp313-cp313-win32.whl", hash = "sha256:528038bf902e9f831ab9dff4b0f777aa6dc3e17454ee07a21fa84277d4cae768", size = 750011, upload-time = "2025-11-16T19:04:05.99Z" },
     { url = "https://files.pythonhosted.org/packages/66/18/d427cc16e1974503720106abe190b882d27b101f3ba83aa49deb04022ecb/jsonpath_rust_bindings-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:e364eb3234ee5f36bbf40d2409979ac6f4dd6a04f89438137649174d0ac8a0a3", size = 808602, upload-time = "2025-11-16T19:03:48.912Z" },
     { url = "https://files.pythonhosted.org/packages/c3/c5/5e4edf2a76d1d07a1ea343ca9cf859a23b0819867291814be05c7afeedff/jsonpath_rust_bindings-1.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:924d993c08c807e54fcfcf2ad21a51c663f9af329d14d1bc3d8b21902a0c9741", size = 747523, upload-time = "2025-11-16T19:03:40.416Z" },
-    { url = "https://files.pythonhosted.org/packages/54/6d/86076c2eb1aa886fa2e22a86ef88c4eac83360ac94f77ac7eeafc80ffc3d/jsonpath_rust_bindings-1.1.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:1ae390842ed00983b5b2b648f5e87f4dce6c8435fc3b5d54987fe55197bbd08f", size = 899275, upload-time = "2025-11-16T19:02:18.403Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/22/24f8aaa63ff342b441ce1f32e206e533f8d0ece19e89955313fc7243d884/jsonpath_rust_bindings-1.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:62d2d07129e49b5e8306c492ba88890fffeaa7d5dcf483ceef05f1ea371d0656", size = 814873, upload-time = "2025-11-16T19:02:01.488Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/6a/099c1ee9c1eb5e717d0eb70236dae34c665e1ad29cc742f86711b142a243/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2baad20da8f4800599e9294834cfc38acadd59ababb31e1b67dce01377ee30ff", size = 832311, upload-time = "2025-11-16T19:00:20.244Z" },
-    { url = "https://files.pythonhosted.org/packages/20/75/3acfd6f1b158f6455c8795d2752282cb3de73eebcc6fb1ee95c536abc9ab/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:082a6638ba126bdfa2be38b1126c954d8fb7f87358d8abfc1d993ec9ebb0894c", size = 836867, upload-time = "2025-11-16T19:00:36.872Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/0d/ab8bc58dbefc1b4ea8b44506c8ab9ac2bf84629b9f8f6e2dd064eb19d4c5/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e72af8d566b4e08e212f67d49061dd82df0db9a6a295d1a3d5b78d0566a4ef4", size = 931945, upload-time = "2025-11-16T19:00:53.827Z" },
-    { url = "https://files.pythonhosted.org/packages/63/42/fe917a76245d3effcf89c3d70b854a5fbdf5c23624a8114c13f45626efe3/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:96fb734bae80a098b5876a9b44677c023cd7b93346a4d28a18bdef086cc21c71", size = 959706, upload-time = "2025-11-16T19:01:10.181Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/dc/8364a55d3103c6ce38e61b80f3a3775302a049f5e16e5d741ead5037c505/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b4fc4106d74d3cb5cb57a6c35b682db50662b9bb501dffecd5cdcdeee9a8367", size = 947323, upload-time = "2025-11-16T19:01:42.702Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/00/ff1d7dc2ff75e91a9f0835df53b8fa9eef1aa805e105999149d8ebfb3211/jsonpath_rust_bindings-1.1.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a95feebca3a7ff2aab3429a18ddea5247c0c2ed5370cefd78d412c5f78191e2a", size = 943743, upload-time = "2025-11-16T19:01:26.22Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/f5/cc58c47854132565b6e36aef5aa48b4c35521a1230d3fbf9e4468be708ee/jsonpath_rust_bindings-1.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ca94d86cdeb623a711a20780fbf947d60d0fffcf2fb1276239ca89590cfdf03d", size = 1067125, upload-time = "2025-11-16T19:02:34.809Z" },
-    { url = "https://files.pythonhosted.org/packages/95/9f/a1256090c3f65db067e96ed528c40806917783091587a81f2063df18ea4f/jsonpath_rust_bindings-1.1.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:eb54794b19ad60a6d0a778fac8588e54db8985da894548a25457edd702ec40e9", size = 1149151, upload-time = "2025-11-16T19:02:51.238Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/50/3fb9e66e60fe569587c824e7d2c147931e33704b7a990794974386afd047/jsonpath_rust_bindings-1.1.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ccccf83576a332d916a1dc053735e64aa4adfb40f14cefc1600df7b35d06f3c9", size = 1106555, upload-time = "2025-11-16T19:03:07.811Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/55/be1bd6a6250761387a76b032de18f01bf0e6e2a5704766ae107013970669/jsonpath_rust_bindings-1.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e242a58b5d2aacdc2346e2a6cdfa8f1d90f92e3d9e76518b63b3402e91551a10", size = 1134317, upload-time = "2025-11-16T19:03:25.423Z" },
-    { url = "https://files.pythonhosted.org/packages/25/23/25afbb1938708d1a546483dca7c24b8277547da2c30527ceed44bd73171c/jsonpath_rust_bindings-1.1.1-cp314-cp314-win32.whl", hash = "sha256:085e3d33ded7a1f838bd4374fd2bf1eb418076efe02b4740f4066b09b5b61a88", size = 750319, upload-time = "2025-11-16T19:04:07.546Z" },
-    { url = "https://files.pythonhosted.org/packages/29/06/ff2042c07e7f7dab0decd2b2cd0a1022a5d01510e85496a1a736f1d40b22/jsonpath_rust_bindings-1.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:edd8b32f2aa82f33205f460035c28641a856834662df0dfb6323cb330f16690c", size = 808653, upload-time = "2025-11-16T19:03:50.491Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/0f/bdf99abf8b08eebcb5f96e6da06dcc06017cb527fc5f199bf7ce033ab13d/jsonpath_rust_bindings-1.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:6ba310703af5b437a5e4e041a2f36d1fcd744152ea6e1e83bfcf78bca032bfe3", size = 747614, upload-time = "2025-11-16T19:03:41.965Z" },
     { url = "https://files.pythonhosted.org/packages/7e/40/cbe5d67ed516912b93910858b28a2449aaed526026ba696ec9fd497b2b44/jsonpath_rust_bindings-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a823137a703d22eaf8038ec574eddbf279a3008bc63b67905dd6cb9a53dd9e5b", size = 898463, upload-time = "2025-11-16T19:02:23.386Z" },
     { url = "https://files.pythonhosted.org/packages/6f/65/93a4033bb9ee9d3709cbc949947bfa408f53705d173c4795e660a01003b3/jsonpath_rust_bindings-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c91691b50b733d124272ae5ea525291ba9c5772f6b970ba61a4d33d48d3a2976", size = 815097, upload-time = "2025-11-16T19:02:07.35Z" },
     { url = "https://files.pythonhosted.org/packages/81/16/406665b6ede59573bdb9e69b4054cd09166d31e5ab6c460beb502df8151b/jsonpath_rust_bindings-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50e2ca86dab1197106beccabee4e02fb0f6fb366dd22008a5bbd3bd9ff154fc5", size = 833299, upload-time = "2025-11-16T19:00:25.257Z" },
@@ -2750,8 +2670,8 @@ name = "libcst"
 version = "1.8.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pyyaml", marker = "python_full_version != '3.13.*'" },
-    { name = "pyyaml-ft", marker = "python_full_version == '3.13.*'" },
+    { name = "pyyaml", marker = "python_full_version < '3.13'" },
+    { name = "pyyaml-ft", marker = "python_full_version >= '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/de/cd/337df968b38d94c5aabd3e1b10630f047a2b345f6e1d4456bd9fe7417537/libcst-1.8.6.tar.gz", hash = "sha256:f729c37c9317126da9475bdd06a7208eb52fcbd180a6341648b45a56b4ba708b", size = 891354, upload-time = "2025-11-03T22:33:30.621Z" }
 wheels = [
@@ -2795,22 +2715,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ad/cd/15762659a3f5799d36aab1bc2b7e732672722e249d7800e3c5f943b41250/libcst-1.8.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f04febcd70e1e67917be7de513c8d4749d2e09206798558d7fe632134426ea4", size = 2392661, upload-time = "2025-11-03T22:32:47.232Z" },
     { url = "https://files.pythonhosted.org/packages/e4/6b/b7f9246c323910fcbe021241500f82e357521495dcfe419004dbb272c7cb/libcst-1.8.6-cp313-cp313t-win_amd64.whl", hash = "sha256:1dc3b897c8b0f7323412da3f4ad12b16b909150efc42238e19cbf19b561cc330", size = 2105068, upload-time = "2025-11-03T22:32:49.145Z" },
     { url = "https://files.pythonhosted.org/packages/a6/0b/4fd40607bc4807ec2b93b054594373d7fa3d31bb983789901afcb9bcebe9/libcst-1.8.6-cp313-cp313t-win_arm64.whl", hash = "sha256:44f38139fa95e488db0f8976f9c7ca39a64d6bc09f2eceef260aa1f6da6a2e42", size = 1985181, upload-time = "2025-11-03T22:32:50.597Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/60/4105441989e321f7ad0fd28ffccb83eb6aac0b7cfb0366dab855dcccfbe5/libcst-1.8.6-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:b188e626ce61de5ad1f95161b8557beb39253de4ec74fc9b1f25593324a0279c", size = 2204202, upload-time = "2025-11-03T22:32:52.311Z" },
-    { url = "https://files.pythonhosted.org/packages/67/2f/51a6f285c3a183e50cfe5269d4a533c21625aac2c8de5cdf2d41f079320d/libcst-1.8.6-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:87e74f7d7dfcba9efa91127081e22331d7c42515f0a0ac6e81d4cf2c3ed14661", size = 2083581, upload-time = "2025-11-03T22:32:54.269Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/64/921b1c19b638860af76cdb28bc81d430056592910b9478eea49e31a7f47a/libcst-1.8.6-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:3a926a4b42015ee24ddfc8ae940c97bd99483d286b315b3ce82f3bafd9f53474", size = 2236495, upload-time = "2025-11-03T22:32:55.723Z" },
-    { url = "https://files.pythonhosted.org/packages/12/a8/b00592f9bede618cbb3df6ffe802fc65f1d1c03d48a10d353b108057d09c/libcst-1.8.6-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:3f4fbb7f569e69fd9e89d9d9caa57ca42c577c28ed05062f96a8c207594e75b8", size = 2301466, upload-time = "2025-11-03T22:32:57.337Z" },
-    { url = "https://files.pythonhosted.org/packages/af/df/790d9002f31580fefd0aec2f373a0f5da99070e04c5e8b1c995d0104f303/libcst-1.8.6-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:08bd63a8ce674be431260649e70fca1d43f1554f1591eac657f403ff8ef82c7a", size = 2300264, upload-time = "2025-11-03T22:32:58.852Z" },
-    { url = "https://files.pythonhosted.org/packages/21/de/dc3f10e65bab461be5de57850d2910a02c24c3ddb0da28f0e6e4133c3487/libcst-1.8.6-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e00e275d4ba95d4963431ea3e409aa407566a74ee2bf309a402f84fc744abe47", size = 2408572, upload-time = "2025-11-03T22:33:00.552Z" },
-    { url = "https://files.pythonhosted.org/packages/20/3b/35645157a7590891038b077db170d6dd04335cd2e82a63bdaa78c3297dfe/libcst-1.8.6-cp314-cp314-win_amd64.whl", hash = "sha256:fea5c7fa26556eedf277d4f72779c5ede45ac3018650721edd77fd37ccd4a2d4", size = 2193917, upload-time = "2025-11-03T22:33:02.354Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/a2/1034a9ba7d3e82f2c2afaad84ba5180f601aed676d92b76325797ad60951/libcst-1.8.6-cp314-cp314-win_arm64.whl", hash = "sha256:bb9b4077bdf8857b2483879cbbf70f1073bc255b057ec5aac8a70d901bb838e9", size = 2078748, upload-time = "2025-11-03T22:33:03.707Z" },
-    { url = "https://files.pythonhosted.org/packages/95/a1/30bc61e8719f721a5562f77695e6154e9092d1bdf467aa35d0806dcd6cea/libcst-1.8.6-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:55ec021a296960c92e5a33b8d93e8ad4182b0eab657021f45262510a58223de1", size = 2188980, upload-time = "2025-11-03T22:33:05.152Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/14/c660204532407c5628e3b615015a902ed2d0b884b77714a6bdbe73350910/libcst-1.8.6-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ba9ab2b012fbd53b36cafd8f4440a6b60e7e487cd8b87428e57336b7f38409a4", size = 2074828, upload-time = "2025-11-03T22:33:06.864Z" },
-    { url = "https://files.pythonhosted.org/packages/82/e2/c497c354943dff644749f177ee9737b09ed811b8fc842b05709a40fe0d1b/libcst-1.8.6-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c0a0cc80aebd8aa15609dd4d330611cbc05e9b4216bcaeabba7189f99ef07c28", size = 2225568, upload-time = "2025-11-03T22:33:08.354Z" },
-    { url = "https://files.pythonhosted.org/packages/86/ef/45999676d07bd6d0eefa28109b4f97124db114e92f9e108de42ba46a8028/libcst-1.8.6-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:42a4f68121e2e9c29f49c97f6154e8527cd31021809cc4a941c7270aa64f41aa", size = 2286523, upload-time = "2025-11-03T22:33:10.206Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/6c/517d8bf57d9f811862f4125358caaf8cd3320a01291b3af08f7b50719db4/libcst-1.8.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a434c521fadaf9680788b50d5c21f4048fa85ed19d7d70bd40549fbaeeecab1", size = 2288044, upload-time = "2025-11-03T22:33:11.628Z" },
-    { url = "https://files.pythonhosted.org/packages/83/ce/24d7d49478ffb61207f229239879845da40a374965874f5ee60f96b02ddb/libcst-1.8.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6a65f844d813ab4ef351443badffa0ae358f98821561d19e18b3190f59e71996", size = 2392605, upload-time = "2025-11-03T22:33:12.962Z" },
-    { url = "https://files.pythonhosted.org/packages/39/c3/829092ead738b71e96a4e96896c96f276976e5a8a58b4473ed813d7c962b/libcst-1.8.6-cp314-cp314t-win_amd64.whl", hash = "sha256:bdb14bc4d4d83a57062fed2c5da93ecb426ff65b0dc02ddf3481040f5f074a82", size = 2181581, upload-time = "2025-11-03T22:33:14.514Z" },
-    { url = "https://files.pythonhosted.org/packages/98/6d/5d6a790a02eb0d9d36c4aed4f41b277497e6178900b2fa29c35353aa45ed/libcst-1.8.6-cp314-cp314t-win_arm64.whl", hash = "sha256:819c8081e2948635cab60c603e1bbdceccdfe19104a242530ad38a36222cb88f", size = 2065000, upload-time = "2025-11-03T22:33:16.257Z" },
 ]
 
 [[package]]
@@ -2943,28 +2847,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/77/e8/4598413aece46ca38d9260ef6c51534bd5f34b5c21474fcf210ce3a02123/librt-0.7.3-cp313-cp313-win32.whl", hash = "sha256:44b3689b040df57f492e02cd4f0bacd1b42c5400e4b8048160c9d5e866de8abe", size = 47936, upload-time = "2025-12-06T19:04:02.054Z" },
     { url = "https://files.pythonhosted.org/packages/af/80/ac0e92d5ef8c6791b3e2c62373863827a279265e0935acdf807901353b0e/librt-0.7.3-cp313-cp313-win_amd64.whl", hash = "sha256:6b407c23f16ccc36614c136251d6b32bf30de7a57f8e782378f1107be008ddb0", size = 54965, upload-time = "2025-12-06T19:04:03.224Z" },
     { url = "https://files.pythonhosted.org/packages/f1/fd/042f823fcbff25c1449bb4203a29919891ca74141b68d3a5f6612c4ce283/librt-0.7.3-cp313-cp313-win_arm64.whl", hash = "sha256:abfc57cab3c53c4546aee31859ef06753bfc136c9d208129bad23e2eca39155a", size = 48350, upload-time = "2025-12-06T19:04:04.234Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/ae/c6ecc7bb97134a71b5241e8855d39964c0e5f4d96558f0d60593892806d2/librt-0.7.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:120dd21d46ff875e849f1aae19346223cf15656be489242fe884036b23d39e93", size = 55175, upload-time = "2025-12-06T19:04:05.308Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/bc/2cc0cb0ab787b39aa5c7645cd792433c875982bdf12dccca558b89624594/librt-0.7.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1617bea5ab31266e152871208502ee943cb349c224846928a1173c864261375e", size = 56881, upload-time = "2025-12-06T19:04:06.674Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/87/397417a386190b70f5bf26fcedbaa1515f19dce33366e2684c6b7ee83086/librt-0.7.3-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:93b2a1f325fefa1482516ced160c8c7b4b8d53226763fa6c93d151fa25164207", size = 163710, upload-time = "2025-12-06T19:04:08.437Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/37/7338f85b80e8a17525d941211451199845093ca242b32efbf01df8531e72/librt-0.7.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3d4801db8354436fd3936531e7f0e4feb411f62433a6b6cb32bb416e20b529f", size = 172471, upload-time = "2025-12-06T19:04:10.124Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/e0/741704edabbfae2c852fedc1b40d9ed5a783c70ed3ed8e4fe98f84b25d13/librt-0.7.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11ad45122bbed42cfc8b0597450660126ef28fd2d9ae1a219bc5af8406f95678", size = 186804, upload-time = "2025-12-06T19:04:11.586Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/d1/0a82129d6ba242f3be9af34815be089f35051bc79619f5c27d2c449ecef6/librt-0.7.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6b4e7bff1d76dd2b46443078519dc75df1b5e01562345f0bb740cea5266d8218", size = 181817, upload-time = "2025-12-06T19:04:12.802Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/32/704f80bcf9979c68d4357c46f2af788fbf9d5edda9e7de5786ed2255e911/librt-0.7.3-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:d86f94743a11873317094326456b23f8a5788bad9161fd2f0e52088c33564620", size = 175602, upload-time = "2025-12-06T19:04:14.004Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/6d/4355cfa0fae0c062ba72f541d13db5bc575770125a7ad3d4f46f4109d305/librt-0.7.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:754a0d09997095ad764ccef050dd5bf26cbf457aab9effcba5890dad081d879e", size = 196497, upload-time = "2025-12-06T19:04:15.487Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/eb/ac6d8517d44209e5a712fde46f26d0055e3e8969f24d715f70bd36056230/librt-0.7.3-cp314-cp314-win32.whl", hash = "sha256:fbd7351d43b80d9c64c3cfcb50008f786cc82cba0450e8599fdd64f264320bd3", size = 44678, upload-time = "2025-12-06T19:04:16.688Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/93/238f026d141faf9958da588c761a0812a1a21c98cc54a76f3608454e4e59/librt-0.7.3-cp314-cp314-win_amd64.whl", hash = "sha256:d376a35c6561e81d2590506804b428fc1075fcc6298fc5bb49b771534c0ba010", size = 51689, upload-time = "2025-12-06T19:04:17.726Z" },
-    { url = "https://files.pythonhosted.org/packages/52/44/43f462ad9dcf9ed7d3172fe2e30d77b980956250bd90e9889a9cca93df2a/librt-0.7.3-cp314-cp314-win_arm64.whl", hash = "sha256:cbdb3f337c88b43c3b49ca377731912c101178be91cb5071aac48faa898e6f8e", size = 44662, upload-time = "2025-12-06T19:04:18.771Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/35/fed6348915f96b7323241de97f26e2af481e95183b34991df12fd5ce31b1/librt-0.7.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9f0e0927efe87cd42ad600628e595a1a0aa1c64f6d0b55f7e6059079a428641a", size = 57347, upload-time = "2025-12-06T19:04:19.812Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/f2/045383ccc83e3fea4fba1b761796584bc26817b6b2efb6b8a6731431d16f/librt-0.7.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:020c6db391268bcc8ce75105cb572df8cb659a43fd347366aaa407c366e5117a", size = 59223, upload-time = "2025-12-06T19:04:20.862Z" },
-    { url = "https://files.pythonhosted.org/packages/77/3f/c081f8455ab1d7f4a10dbe58463ff97119272ff32494f21839c3b9029c2c/librt-0.7.3-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7af7785f5edd1f418da09a8cdb9ec84b0213e23d597413e06525340bcce1ea4f", size = 183861, upload-time = "2025-12-06T19:04:21.963Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/f5/73c5093c22c31fbeaebc25168837f05ebfd8bf26ce00855ef97a5308f36f/librt-0.7.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8ccadf260bb46a61b9c7e89e2218f6efea9f3eeaaab4e3d1f58571890e54858e", size = 194594, upload-time = "2025-12-06T19:04:23.14Z" },
-    { url = "https://files.pythonhosted.org/packages/78/b8/d5f17d4afe16612a4a94abfded94c16c5a033f183074fb130dfe56fc1a42/librt-0.7.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d9883b2d819ce83f87ba82a746c81d14ada78784db431e57cc9719179847376e", size = 206759, upload-time = "2025-12-06T19:04:24.328Z" },
-    { url = "https://files.pythonhosted.org/packages/36/2e/021765c1be85ee23ffd5b5b968bb4cba7526a4db2a0fc27dcafbdfc32da7/librt-0.7.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:59cb0470612d21fa1efddfa0dd710756b50d9c7fb6c1236bbf8ef8529331dc70", size = 203210, upload-time = "2025-12-06T19:04:25.544Z" },
-    { url = "https://files.pythonhosted.org/packages/77/f0/9923656e42da4fd18c594bd08cf6d7e152d4158f8b808e210d967f0dcceb/librt-0.7.3-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:1fe603877e1865b5fd047a5e40379509a4a60204aa7aa0f72b16f7a41c3f0712", size = 196708, upload-time = "2025-12-06T19:04:26.725Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/0b/0708b886ac760e64d6fbe7e16024e4be3ad1a3629d19489a97e9cf4c3431/librt-0.7.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5460d99ed30f043595bbdc888f542bad2caeb6226b01c33cda3ae444e8f82d42", size = 217212, upload-time = "2025-12-06T19:04:27.892Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/7f/12a73ff17bca4351e73d585dd9ebf46723c4a8622c4af7fe11a2e2d011ff/librt-0.7.3-cp314-cp314t-win32.whl", hash = "sha256:d09f677693328503c9e492e33e9601464297c01f9ebd966ea8fc5308f3069bfd", size = 45586, upload-time = "2025-12-06T19:04:29.116Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/df/8decd032ac9b995e4f5606cde783711a71094128d88d97a52e397daf2c89/librt-0.7.3-cp314-cp314t-win_amd64.whl", hash = "sha256:25711f364c64cab2c910a0247e90b51421e45dbc8910ceeb4eac97a9e132fc6f", size = 53002, upload-time = "2025-12-06T19:04:30.173Z" },
-    { url = "https://files.pythonhosted.org/packages/de/0c/6605b6199de8178afe7efc77ca1d8e6db00453bc1d3349d27605c0f42104/librt-0.7.3-cp314-cp314t-win_arm64.whl", hash = "sha256:a9f9b661f82693eb56beb0605156c7fca57f535704ab91837405913417d6990b", size = 45647, upload-time = "2025-12-06T19:04:31.302Z" },
 ]
 
 [[package]]
@@ -3107,42 +2989,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5e/73/16596f7e4e38fa33084b9ccbccc22a15f82a290a055126f2c1541236d2ff/lxml-6.1.0-cp313-cp313-win32.whl", hash = "sha256:28902146ffbe5222df411c5d19e5352490122e14447e98cd118907ee3fd6ee62", size = 3596938, upload-time = "2026-04-18T04:31:56.206Z" },
     { url = "https://files.pythonhosted.org/packages/8e/63/981401c5680c1eb30893f00a19641ac80db5d1e7086c62cb4b13ed813038/lxml-6.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:4a1503c56e4e2b38dc76f2f2da7bae69670c0f1933e27cfa34b2fa5876410b16", size = 3995728, upload-time = "2026-04-18T04:31:58.763Z" },
     { url = "https://files.pythonhosted.org/packages/e7/e8/c358a38ac3e541d16a1b527e4e9cb78c0419b0506a070ace11777e5e8404/lxml-6.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:e0af85773850417d994d019741239b901b22c6680206f46a34766926e466141d", size = 3658372, upload-time = "2026-04-18T04:32:03.629Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/45/cee4cf203ef0bab5c52afc118da61d6b460c928f2893d40023cfa27e0b80/lxml-6.1.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:ab863fd37458fed6456525f297d21239d987800c46e67da5ef04fc6b3dd93ac8", size = 8576713, upload-time = "2026-04-18T04:32:06.831Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/a7/eda05babeb7e046839204eaf254cd4d7c9130ce2bbf0d9e90ea41af5654d/lxml-6.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6fd8b1df8254ff4fd93fd31da1fc15770bde23ac045be9bb1f87425702f61cc9", size = 4623874, upload-time = "2026-04-18T04:32:10.755Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/e9/db5846de9b436b91890a62f29d80cd849ea17948a49bf532d5278ee69a9e/lxml-6.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:47024feaae386a92a146af0d2aeed65229bf6fff738e6a11dda6b0015fb8fd03", size = 4949535, upload-time = "2026-04-18T04:34:06.657Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/ba/0d3593373dcae1d68f40dc3c41a5a92f2544e68115eb2f62319a4c2a6500/lxml-6.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3f00972f84450204cd5d93a5395965e348956aaceaadec693a22ec743f8ae3eb", size = 5086881, upload-time = "2026-04-18T04:34:09.556Z" },
-    { url = "https://files.pythonhosted.org/packages/43/76/759a7484539ad1af0d125a9afe9c3fb5f82a8779fd1f5f56319d9e4ea2fd/lxml-6.1.0-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97faa0860e13b05b15a51fb4986421ef7a30f0b3334061c416e0981e9450ca4c", size = 5031305, upload-time = "2026-04-18T04:34:12.336Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/b9/c1f0daf981a11e47636126901fd4ab82429e18c57aeb0fc3ad2940b42d8b/lxml-6.1.0-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:972a6451204798675407beaad97b868d0c733d9a74dafefc63120b81b8c2de28", size = 5647522, upload-time = "2026-04-18T04:34:14.89Z" },
-    { url = "https://files.pythonhosted.org/packages/31/e6/1f533dcd205275363d9ba3511bcec52fa2df86abf8abe6a5f2c599f0dc31/lxml-6.1.0-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fe022f20bc4569ec66b63b3fb275a3d628d9d32da6326b2982584104db6d3086", size = 5239310, upload-time = "2026-04-18T04:34:17.652Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/8c/4175fb709c78a6e315ed814ed33be3defd8b8721067e70419a6cf6f971da/lxml-6.1.0-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:75c4c7c619a744f972f4451bf5adf6d0fb00992a1ffc9fd78e13b0bc817cc99f", size = 5350799, upload-time = "2026-04-18T04:34:20.529Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/77/6ffdebc5994975f0dde4acb59761902bd9d9bb84422b9a0bd239a7da9ca8/lxml-6.1.0-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3648f20d25102a22b6061c688beb3a805099ea4beb0a01ce62975d926944d292", size = 4697693, upload-time = "2026-04-18T04:34:23.541Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/f1/565f36bd5c73294602d48e04d23f81ff4c8736be6ba5e1d1ec670ac9be80/lxml-6.1.0-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:77b9f99b17cbf14026d1e618035077060fc7195dd940d025149f3e2e830fbfcb", size = 5250708, upload-time = "2026-04-18T04:34:26.001Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/11/a68ab9dd18c5c499404deb4005f4bc4e0e88e5b72cd755ad96efec81d18d/lxml-6.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:32662519149fd7a9db354175aa5e417d83485a8039b8aaa62f873ceee7ea4cad", size = 5084737, upload-time = "2026-04-18T04:34:28.32Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/78/e8f41e2c74f4af564e6a0348aea69fb6daaefa64bc071ef469823d22cc18/lxml-6.1.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:73d658216fc173cf2c939e90e07b941c5e12736b0bf6a99e7af95459cfe8eabb", size = 4737817, upload-time = "2026-04-18T04:34:30.784Z" },
-    { url = "https://files.pythonhosted.org/packages/06/2d/aa4e117aa2ce2f3b35d9ff246be74a2f8e853baba5d2a92c64744474603a/lxml-6.1.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ac4db068889f8772a4a698c5980ec302771bb545e10c4b095d4c8be26749616f", size = 5670753, upload-time = "2026-04-18T04:34:33.675Z" },
-    { url = "https://files.pythonhosted.org/packages/08/f5/dd745d50c0409031dbfcc4881740542a01e54d6f0110bd420fa7782110b8/lxml-6.1.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:45e9dfbd1b661eb64ba0d4dbe762bd210c42d86dd1e5bd2bdf89d634231beb43", size = 5238071, upload-time = "2026-04-18T04:34:36.12Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/74/ad424f36d0340a904665867dab310a3f1f4c96ff4039698de83b77f44c1f/lxml-6.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:89e8d73d09ac696a5ba42ec69787913d53284f12092f651506779314f10ba585", size = 5264319, upload-time = "2026-04-18T04:34:39.035Z" },
-    { url = "https://files.pythonhosted.org/packages/53/36/a15d8b3514ec889bfd6aa3609107fcb6c9189f8dc347f1c0b81eded8d87c/lxml-6.1.0-cp314-cp314-win32.whl", hash = "sha256:ebe33f4ec1b2de38ceb225a1749a2965855bffeef435ba93cd2d5d540783bf2f", size = 3657139, upload-time = "2026-04-18T04:32:20.006Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/a4/263ebb0710851a3c6c937180a9a86df1206fdfe53cc43005aa2237fd7736/lxml-6.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:398443df51c538bd578529aa7e5f7afc6c292644174b47961f3bf87fe5741120", size = 4064195, upload-time = "2026-04-18T04:32:23.876Z" },
-    { url = "https://files.pythonhosted.org/packages/80/68/2000f29d323b6c286de077ad20b429fc52272e44eae6d295467043e56012/lxml-6.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:8c8984e1d8c4b3949e419158fda14d921ff703a9ed8a47236c6eb7a2b6cb4946", size = 3741870, upload-time = "2026-04-18T04:32:27.922Z" },
-    { url = "https://files.pythonhosted.org/packages/30/e9/21383c7c8d43799f0da90224c0d7c921870d476ec9b3e01e1b2c0b8237c5/lxml-6.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:1081dd10bc6fa437db2500e13993abf7cc30716d0a2f40e65abb935f02ec559c", size = 8827548, upload-time = "2026-04-18T04:32:15.094Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/01/c6bc11cd587030dd4f719f65c5657960649fe3e19196c844c75bf32cd0d6/lxml-6.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:dabecc48db5f42ba348d1f5d5afdc54c6c4cc758e676926c7cd327045749517d", size = 4735866, upload-time = "2026-04-18T04:32:18.924Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/01/757132fff5f4acf25463b5298f1a46099f3a94480b806547b29ce5e385de/lxml-6.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e3dd5fe19c9e0ac818a9c7f132a5e43c1339ec1cbbfecb1a938bd3a47875b7c9", size = 4969476, upload-time = "2026-04-18T04:34:41.889Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/fb/1bc8b9d27ed64be7c8903db6c89e74dc8c2cd9ec630a7462e4654316dc5b/lxml-6.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9e7b0a4ca6dcc007a4cef00a761bba2dea959de4bd2df98f926b33c92ca5dfb9", size = 5103719, upload-time = "2026-04-18T04:34:44.797Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/e7/5bf82fa28133536a54601aae633b14988e89ed61d4c1eb6b899b023233aa/lxml-6.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d27bbe326c6b539c64b42638b18bc6003a8d88f76213a97ac9ed4f885efeab7", size = 5027890, upload-time = "2026-04-18T04:34:47.634Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/20/e048db5d4b4ea0366648aa595f26bb764b2670903fc585b87436d0a5032c/lxml-6.1.0-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4e425db0c5445ef0ad56b0eec54f89b88b2d884656e536a90b2f52aecb4ca86", size = 5596008, upload-time = "2026-04-18T04:34:51.503Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/c2/d10807bc8da4824b39e5bd01b5d05c077b6fd01bd91584167edf6b269d22/lxml-6.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4b89b098105b8599dc57adac95d1813409ac476d3c948a498775d3d0c6124bfb", size = 5224451, upload-time = "2026-04-18T04:34:54.263Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/15/2ebea45bea427e7f0057e9ce7b2d62c5aba20c6b001cca89ed0aadb3ad41/lxml-6.1.0-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:c4a699432846df86cc3de502ee85f445ebad748a1c6021d445f3e514d2cd4b1c", size = 5312135, upload-time = "2026-04-18T04:34:56.818Z" },
-    { url = "https://files.pythonhosted.org/packages/31/e2/87eeae151b0be2a308d49a7ec444ff3eb192b14251e62addb29d0bf3778f/lxml-6.1.0-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:30e7b2ed63b6c8e97cca8af048589a788ab5c9c905f36d9cf1c2bb549f450d2f", size = 4639126, upload-time = "2026-04-18T04:34:59.704Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/51/8a3f6a20902ad604dd746ec7b4000311b240d389dac5e9d95adefd349e0c/lxml-6.1.0-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:022981127642fe19866d2907d76241bb07ed21749601f727d5d5dd1ce5d1b773", size = 5232579, upload-time = "2026-04-18T04:35:02.658Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/d2/650d619bdbe048d2c3f2c31edb00e35670a5e2d65b4fe3b61bce37b19121/lxml-6.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:23cad0cc86046d4222f7f418910e46b89971c5a45d3c8abfad0f64b7b05e4a9b", size = 5084206, upload-time = "2026-04-18T04:35:05.175Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/8a/672ca1a3cbeabd1f511ca275a916c0514b747f4b85bdaae103b8fa92f307/lxml-6.1.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:21c3302068f50d1e8728c67c87ba92aa87043abee517aa2576cca1855326b405", size = 4758906, upload-time = "2026-04-18T04:35:08.098Z" },
-    { url = "https://files.pythonhosted.org/packages/be/f1/ef4b691da85c916cb2feb1eec7414f678162798ac85e042fa164419ac05c/lxml-6.1.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:be10838781cb3be19251e276910cd508fe127e27c3242e50521521a0f3781690", size = 5620553, upload-time = "2026-04-18T04:35:11.23Z" },
-    { url = "https://files.pythonhosted.org/packages/59/17/94e81def74107809755ac2782fdad4404420f1c92ca83433d117a6d5acf0/lxml-6.1.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2173a7bffe97667bbf0767f8a99e587740a8c56fdf3befac4b09cb29a80276fd", size = 5229458, upload-time = "2026-04-18T04:35:14.254Z" },
-    { url = "https://files.pythonhosted.org/packages/21/55/c4be91b0f830a871fc1b0d730943d56013b683d4671d5198260e2eae722b/lxml-6.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c6854e9cf99c84beb004eecd7d3a3868ef1109bf2b1df92d7bc11e96a36c2180", size = 5247861, upload-time = "2026-04-18T04:35:17.006Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/ca/77123e4d77df3cb1e968ade7b1f808f5d3a5c1c96b18a33895397de292c1/lxml-6.1.0-cp314-cp314t-win32.whl", hash = "sha256:00750d63ef0031a05331b9223463b1c7c02b9004cef2346a5b2877f0f9494dd2", size = 3897377, upload-time = "2026-04-18T04:32:07.656Z" },
-    { url = "https://files.pythonhosted.org/packages/64/ce/3554833989d074267c063209bae8b09815e5656456a2d332b947806b05ff/lxml-6.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:80410c3a7e3c617af04de17caa9f9f20adaa817093293d69eae7d7d0522836f5", size = 4392701, upload-time = "2026-04-18T04:32:12.113Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/a0/9b916c68c0e57752c07f8f64b30138d9d4059dbeb27b90274dedbea128ff/lxml-6.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:26dd9f57ee3bd41e7d35b4c98a2ffd89ed11591649f421f0ec19f67d50ec67ac", size = 3817120, upload-time = "2026-04-18T04:32:15.803Z" },
     { url = "https://files.pythonhosted.org/packages/f2/88/55143966481409b1740a3ac669e611055f49efd68087a5ce41582325db3e/lxml-6.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:546b66c0dd1bb8d9fa89d7123e5fa19a8aff3a1f2141eb22df96112afb17b842", size = 3930134, upload-time = "2026-04-18T04:32:35.008Z" },
     { url = "https://files.pythonhosted.org/packages/b5/97/28b985c2983938d3cb696dd5501423afb90a8c3e869ef5d3c62569282c0f/lxml-6.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5cfa1a34df366d9dc0d5eaf420f4cf2bb1e1bebe1066d1c2fc28c179f8a4004c", size = 4210749, upload-time = "2026-04-18T04:36:03.626Z" },
     { url = "https://files.pythonhosted.org/packages/29/67/dfab2b7d58214921935ccea7ce9b3df9b7d46f305d12f0f532ac7cf6b804/lxml-6.1.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db88156fcf544cdbf0d95588051515cfdfd4c876fc66444eb98bceb5d6db76de", size = 4318463, upload-time = "2026-04-18T04:36:06.309Z" },
@@ -3238,28 +3084,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/80/d6/2d1b89f6ca4bff1036499b1e29a1d02d282259f3681540e16563f27ebc23/markupsafe-3.0.3-cp313-cp313t-win32.whl", hash = "sha256:69c0b73548bc525c8cb9a251cddf1931d1db4d2258e9599c28c07ef3580ef354", size = 14612, upload-time = "2025-09-27T18:37:02.639Z" },
     { url = "https://files.pythonhosted.org/packages/2b/98/e48a4bfba0a0ffcf9925fe2d69240bfaa19c6f7507b8cd09c70684a53c1e/markupsafe-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:1b4b79e8ebf6b55351f0d91fe80f893b4743f104bff22e90697db1590e47a218", size = 15200, upload-time = "2025-09-27T18:37:03.582Z" },
     { url = "https://files.pythonhosted.org/packages/0e/72/e3cc540f351f316e9ed0f092757459afbc595824ca724cbc5a5d4263713f/markupsafe-3.0.3-cp313-cp313t-win_arm64.whl", hash = "sha256:ad2cf8aa28b8c020ab2fc8287b0f823d0a7d8630784c31e9ee5edea20f406287", size = 13973, upload-time = "2025-09-27T18:37:04.929Z" },
-    { url = "https://files.pythonhosted.org/packages/33/8a/8e42d4838cd89b7dde187011e97fe6c3af66d8c044997d2183fbd6d31352/markupsafe-3.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:eaa9599de571d72e2daf60164784109f19978b327a3910d3e9de8c97b5b70cfe", size = 11619, upload-time = "2025-09-27T18:37:06.342Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/64/7660f8a4a8e53c924d0fa05dc3a55c9cee10bbd82b11c5afb27d44b096ce/markupsafe-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c47a551199eb8eb2121d4f0f15ae0f923d31350ab9280078d1e5f12b249e0026", size = 12029, upload-time = "2025-09-27T18:37:07.213Z" },
-    { url = "https://files.pythonhosted.org/packages/da/ef/e648bfd021127bef5fa12e1720ffed0c6cbb8310c8d9bea7266337ff06de/markupsafe-3.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f34c41761022dd093b4b6896d4810782ffbabe30f2d443ff5f083e0cbbb8c737", size = 24408, upload-time = "2025-09-27T18:37:09.572Z" },
-    { url = "https://files.pythonhosted.org/packages/41/3c/a36c2450754618e62008bf7435ccb0f88053e07592e6028a34776213d877/markupsafe-3.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:457a69a9577064c05a97c41f4e65148652db078a3a509039e64d3467b9e7ef97", size = 23005, upload-time = "2025-09-27T18:37:10.58Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/20/b7fdf89a8456b099837cd1dc21974632a02a999ec9bf7ca3e490aacd98e7/markupsafe-3.0.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e8afc3f2ccfa24215f8cb28dcf43f0113ac3c37c2f0f0806d8c70e4228c5cf4d", size = 22048, upload-time = "2025-09-27T18:37:11.547Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/a7/591f592afdc734f47db08a75793a55d7fbcc6902a723ae4cfbab61010cc5/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:ec15a59cf5af7be74194f7ab02d0f59a62bdcf1a537677ce67a2537c9b87fcda", size = 23821, upload-time = "2025-09-27T18:37:12.48Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/33/45b24e4f44195b26521bc6f1a82197118f74df348556594bd2262bda1038/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:0eb9ff8191e8498cca014656ae6b8d61f39da5f95b488805da4bb029cccbfbaf", size = 21606, upload-time = "2025-09-27T18:37:13.485Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/0e/53dfaca23a69fbfbbf17a4b64072090e70717344c52eaaaa9c5ddff1e5f0/markupsafe-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2713baf880df847f2bece4230d4d094280f4e67b1e813eec43b4c0e144a34ffe", size = 23043, upload-time = "2025-09-27T18:37:14.408Z" },
-    { url = "https://files.pythonhosted.org/packages/46/11/f333a06fc16236d5238bfe74daccbca41459dcd8d1fa952e8fbd5dccfb70/markupsafe-3.0.3-cp314-cp314-win32.whl", hash = "sha256:729586769a26dbceff69f7a7dbbf59ab6572b99d94576a5592625d5b411576b9", size = 14747, upload-time = "2025-09-27T18:37:15.36Z" },
-    { url = "https://files.pythonhosted.org/packages/28/52/182836104b33b444e400b14f797212f720cbc9ed6ba34c800639d154e821/markupsafe-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:bdc919ead48f234740ad807933cdf545180bfbe9342c2bb451556db2ed958581", size = 15341, upload-time = "2025-09-27T18:37:16.496Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/18/acf23e91bd94fd7b3031558b1f013adfa21a8e407a3fdb32745538730382/markupsafe-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:5a7d5dc5140555cf21a6fefbdbf8723f06fcd2f63ef108f2854de715e4422cb4", size = 14073, upload-time = "2025-09-27T18:37:17.476Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/f0/57689aa4076e1b43b15fdfa646b04653969d50cf30c32a102762be2485da/markupsafe-3.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:1353ef0c1b138e1907ae78e2f6c63ff67501122006b0f9abad68fda5f4ffc6ab", size = 11661, upload-time = "2025-09-27T18:37:18.453Z" },
-    { url = "https://files.pythonhosted.org/packages/89/c3/2e67a7ca217c6912985ec766c6393b636fb0c2344443ff9d91404dc4c79f/markupsafe-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1085e7fbddd3be5f89cc898938f42c0b3c711fdcb37d75221de2666af647c175", size = 12069, upload-time = "2025-09-27T18:37:19.332Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/00/be561dce4e6ca66b15276e184ce4b8aec61fe83662cce2f7d72bd3249d28/markupsafe-3.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b52b4fb9df4eb9ae465f8d0c228a00624de2334f216f178a995ccdcf82c4634", size = 25670, upload-time = "2025-09-27T18:37:20.245Z" },
-    { url = "https://files.pythonhosted.org/packages/50/09/c419f6f5a92e5fadde27efd190eca90f05e1261b10dbd8cbcb39cd8ea1dc/markupsafe-3.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fed51ac40f757d41b7c48425901843666a6677e3e8eb0abcff09e4ba6e664f50", size = 23598, upload-time = "2025-09-27T18:37:21.177Z" },
-    { url = "https://files.pythonhosted.org/packages/22/44/a0681611106e0b2921b3033fc19bc53323e0b50bc70cffdd19f7d679bb66/markupsafe-3.0.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f190daf01f13c72eac4efd5c430a8de82489d9cff23c364c3ea822545032993e", size = 23261, upload-time = "2025-09-27T18:37:22.167Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/57/1b0b3f100259dc9fffe780cfb60d4be71375510e435efec3d116b6436d43/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e56b7d45a839a697b5eb268c82a71bd8c7f6c94d6fd50c3d577fa39a9f1409f5", size = 24835, upload-time = "2025-09-27T18:37:23.296Z" },
-    { url = "https://files.pythonhosted.org/packages/26/6a/4bf6d0c97c4920f1597cc14dd720705eca0bf7c787aebc6bb4d1bead5388/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:f3e98bb3798ead92273dc0e5fd0f31ade220f59a266ffd8a4f6065e0a3ce0523", size = 22733, upload-time = "2025-09-27T18:37:24.237Z" },
-    { url = "https://files.pythonhosted.org/packages/14/c7/ca723101509b518797fedc2fdf79ba57f886b4aca8a7d31857ba3ee8281f/markupsafe-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5678211cb9333a6468fb8d8be0305520aa073f50d17f089b5b4b477ea6e67fdc", size = 23672, upload-time = "2025-09-27T18:37:25.271Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/df/5bd7a48c256faecd1d36edc13133e51397e41b73bb77e1a69deab746ebac/markupsafe-3.0.3-cp314-cp314t-win32.whl", hash = "sha256:915c04ba3851909ce68ccc2b8e2cd691618c4dc4c4232fb7982bca3f41fd8c3d", size = 14819, upload-time = "2025-09-27T18:37:26.285Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/8a/0402ba61a2f16038b48b39bccca271134be00c5c9f0f623208399333c448/markupsafe-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4faffd047e07c38848ce017e8725090413cd80cbc23d86e55c587bf979e579c9", size = 15426, upload-time = "2025-09-27T18:37:27.316Z" },
-    { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" },
 ]
 
 [[package]]
@@ -3267,20 +3091,20 @@ name = "mcp"
 version = "1.27.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "httpx", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "httpx-sse", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jsonschema", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pydantic", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pydantic-settings", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pyjwt", extra = ["crypto"], marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "python-multipart", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pywin32", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'" },
-    { name = "sse-starlette", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "starlette", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "typing-inspection", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "uvicorn", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten'" },
+    { name = "anyio", marker = "python_full_version >= '3.11'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "httpx-sse", marker = "python_full_version >= '3.11'" },
+    { name = "jsonschema", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic-settings", marker = "python_full_version >= '3.11'" },
+    { name = "pyjwt", extra = ["crypto"], marker = "python_full_version >= '3.11'" },
+    { name = "python-multipart", marker = "python_full_version >= '3.11'" },
+    { name = "pywin32", marker = "python_full_version >= '3.11' and sys_platform == 'win32'" },
+    { name = "sse-starlette", marker = "python_full_version >= '3.11'" },
+    { name = "starlette", marker = "python_full_version >= '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
+    { name = "typing-inspection", marker = "python_full_version >= '3.11'" },
+    { name = "uvicorn", marker = "python_full_version >= '3.11' and sys_platform != 'emscripten'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/8b/eb/c0cfc62075dc6e1ec1c64d352ae09ac051d9334311ed226f1f425312848a/mcp-1.27.0.tar.gz", hash = "sha256:d3dc35a7eec0d458c1da4976a48f982097ddaab87e278c5511d5a4a56e852b83", size = 607509, upload-time = "2026-04-02T14:48:08.88Z" }
 wheels = [
@@ -3338,15 +3162,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7d/75/5019423fbe1ba00d63b103bb889da8a11463a9d0254cfbf1662963c05a5e/mecab_python3-1.0.12-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:078cb752f1601b1ac6d31c20ae561b3dc3ca96893aa8c486655a41527801a6b1", size = 569093, upload-time = "2025-11-25T11:45:55.402Z" },
     { url = "https://files.pythonhosted.org/packages/3e/38/96f6401b35342831cd96e03c20ae83f53ac25558a5a80cc3b8acb89b6df0/mecab_python3-1.0.12-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34b1dd02bba8f14961539546848f962ecb491517a4b026c46cd9fe393831079d", size = 591296, upload-time = "2025-11-25T11:23:41.209Z" },
     { url = "https://files.pythonhosted.org/packages/e9/ff/6f63a75ea7983c10dd85d81df696efc1b4edf80731757ab1ccb19777325b/mecab_python3-1.0.12-cp313-cp313-win_amd64.whl", hash = "sha256:ad8508d0bf9827b6ad4b3537d144a1ecbbd6a6841b74430c6b74fe4b8766aa42", size = 502666, upload-time = "2025-11-25T11:23:06.447Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/f5/aeb3526cae978264458418f0509638fa1e074f32abdc960f75c6ed33817f/mecab_python3-1.0.12-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:2b9456d1e0b5686d106a1d5839c9e0ac1a05f2e0ff3ff8480119551bd1ca1837", size = 474910, upload-time = "2025-11-25T11:27:22.136Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/15/4907d8d5b3ed99cfc191dade0c086b6f0dabe9c5cf62465612d331a310e0/mecab_python3-1.0.12-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:7f722a5b88aed9fc4a2f927f6fb4ec0095fbe393119a6070d7d46eea05ce8b69", size = 438560, upload-time = "2025-11-25T11:27:23.411Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/58/caddb2bd98ca847e6743d280011146d45b13d8cd8e7469402d5fa297e30e/mecab_python3-1.0.12-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:680ae889c2e9748e980347f21ac04cef30e01624c8b62571c5f999ab6668feea", size = 435311, upload-time = "2025-11-25T11:27:24.85Z" },
-    { url = "https://files.pythonhosted.org/packages/37/36/f65f81b59f53aa3fd5e6e7fc990d8e2e8c89b82439a78a7d8c90561e40d3/mecab_python3-1.0.12-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:019be4838f437b9e4b9cd5e1e615bdcbc2859ed3171fa921f7c6b8fe5222bd25", size = 569531, upload-time = "2025-11-25T11:45:57.087Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/a7/4309ea9ec9b74bad0ac3e1baaccbea592ebdb2ad99ca426a9f47c3ec4716/mecab_python3-1.0.12-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f88232df9af6d1773822ede87eb8cd1896ca234a92225ea0b03638a60cfbf289", size = 590402, upload-time = "2025-11-25T11:23:42.75Z" },
-    { url = "https://files.pythonhosted.org/packages/50/fe/11e6cb53862ca492c17ed45616e0cd07873281abbaccba529647eb0b2117/mecab_python3-1.0.12-cp314-cp314-win_amd64.whl", hash = "sha256:c6e433c43e3a28d98ef0d85e89709f47c43e72f5b570687f1aef50cbaa8d4b45", size = 513871, upload-time = "2025-11-25T11:23:50.545Z" },
-    { url = "https://files.pythonhosted.org/packages/93/11/e09a4f7b457a59320f3757ba9fe8248db08324930ad2af0bbc4e460208d1/mecab_python3-1.0.12-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:c9b8158c9ac13fce0a842f1ef590cabb28c1db09db17192e6bdbecd291ff37fc", size = 476743, upload-time = "2025-11-25T11:27:26.206Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/5d/5688306d4802ac3e7fe5af8cddf3f087fc4d17afd6ffdabfca5b76113822/mecab_python3-1.0.12-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:04342996c8fc214a942dc39c6768f0a8290a2f1b71f7bcfc51414048e504d265", size = 439489, upload-time = "2025-11-25T11:27:27.308Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/b0/7dc0f08233ec70132cd2e577d9f57d43364bb72cb32656d9218a108a388e/mecab_python3-1.0.12-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:06bc3043a21db5b2476479ded1e95bc94d79ae730fb212cb70b103ef89a408c8", size = 436252, upload-time = "2025-11-25T11:27:28.269Z" },
 ]
 
 [[package]]
@@ -3399,24 +3214,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/67/32/f3cd1667028424fa7001d82e10ee35386eea1408b93d399b09fb0aa7875f/msgpack-1.1.2-cp313-cp313-win32.whl", hash = "sha256:a7787d353595c7c7e145e2331abf8b7ff1e6673a6b974ded96e6d4ec09f00c8c", size = 65037, upload-time = "2025-10-08T09:15:21.416Z" },
     { url = "https://files.pythonhosted.org/packages/74/07/1ed8277f8653c40ebc65985180b007879f6a836c525b3885dcc6448ae6cb/msgpack-1.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:a465f0dceb8e13a487e54c07d04ae3ba131c7c5b95e2612596eafde1dccf64a9", size = 72631, upload-time = "2025-10-08T09:15:22.431Z" },
     { url = "https://files.pythonhosted.org/packages/e5/db/0314e4e2db56ebcf450f277904ffd84a7988b9e5da8d0d61ab2d057df2b6/msgpack-1.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:e69b39f8c0aa5ec24b57737ebee40be647035158f14ed4b40e6f150077e21a84", size = 64118, upload-time = "2025-10-08T09:15:23.402Z" },
-    { url = "https://files.pythonhosted.org/packages/22/71/201105712d0a2ff07b7873ed3c220292fb2ea5120603c00c4b634bcdafb3/msgpack-1.1.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e23ce8d5f7aa6ea6d2a2b326b4ba46c985dbb204523759984430db7114f8aa00", size = 81127, upload-time = "2025-10-08T09:15:24.408Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/9f/38ff9e57a2eade7bf9dfee5eae17f39fc0e998658050279cbb14d97d36d9/msgpack-1.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6c15b7d74c939ebe620dd8e559384be806204d73b4f9356320632d783d1f7939", size = 84981, upload-time = "2025-10-08T09:15:25.812Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/a9/3536e385167b88c2cc8f4424c49e28d49a6fc35206d4a8060f136e71f94c/msgpack-1.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99e2cb7b9031568a2a5c73aa077180f93dd2e95b4f8d3b8e14a73ae94a9e667e", size = 411885, upload-time = "2025-10-08T09:15:27.22Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/40/dc34d1a8d5f1e51fc64640b62b191684da52ca469da9cd74e84936ffa4a6/msgpack-1.1.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:180759d89a057eab503cf62eeec0aa61c4ea1200dee709f3a8e9397dbb3b6931", size = 419658, upload-time = "2025-10-08T09:15:28.4Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/ef/2b92e286366500a09a67e03496ee8b8ba00562797a52f3c117aa2b29514b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:04fb995247a6e83830b62f0b07bf36540c213f6eac8e851166d8d86d83cbd014", size = 403290, upload-time = "2025-10-08T09:15:29.764Z" },
-    { url = "https://files.pythonhosted.org/packages/78/90/e0ea7990abea5764e4655b8177aa7c63cdfa89945b6e7641055800f6c16b/msgpack-1.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8e22ab046fa7ede9e36eeb4cfad44d46450f37bb05d5ec482b02868f451c95e2", size = 415234, upload-time = "2025-10-08T09:15:31.022Z" },
-    { url = "https://files.pythonhosted.org/packages/72/4e/9390aed5db983a2310818cd7d3ec0aecad45e1f7007e0cda79c79507bb0d/msgpack-1.1.2-cp314-cp314-win32.whl", hash = "sha256:80a0ff7d4abf5fecb995fcf235d4064b9a9a8a40a3ab80999e6ac1e30b702717", size = 66391, upload-time = "2025-10-08T09:15:32.265Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/f1/abd09c2ae91228c5f3998dbd7f41353def9eac64253de3c8105efa2082f7/msgpack-1.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:9ade919fac6a3e7260b7f64cea89df6bec59104987cbea34d34a2fa15d74310b", size = 73787, upload-time = "2025-10-08T09:15:33.219Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/b0/9d9f667ab48b16ad4115c1935d94023b82b3198064cb84a123e97f7466c1/msgpack-1.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:59415c6076b1e30e563eb732e23b994a61c159cec44deaf584e5cc1dd662f2af", size = 66453, upload-time = "2025-10-08T09:15:34.225Z" },
-    { url = "https://files.pythonhosted.org/packages/16/67/93f80545eb1792b61a217fa7f06d5e5cb9e0055bed867f43e2b8e012e137/msgpack-1.1.2-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:897c478140877e5307760b0ea66e0932738879e7aa68144d9b78ea4c8302a84a", size = 85264, upload-time = "2025-10-08T09:15:35.61Z" },
-    { url = "https://files.pythonhosted.org/packages/87/1c/33c8a24959cf193966ef11a6f6a2995a65eb066bd681fd085afd519a57ce/msgpack-1.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a668204fa43e6d02f89dbe79a30b0d67238d9ec4c5bd8a940fc3a004a47b721b", size = 89076, upload-time = "2025-10-08T09:15:36.619Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/6b/62e85ff7193663fbea5c0254ef32f0c77134b4059f8da89b958beb7696f3/msgpack-1.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5559d03930d3aa0f3aacb4c42c776af1a2ace2611871c84a75afe436695e6245", size = 435242, upload-time = "2025-10-08T09:15:37.647Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/47/5c74ecb4cc277cf09f64e913947871682ffa82b3b93c8dad68083112f412/msgpack-1.1.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:70c5a7a9fea7f036b716191c29047374c10721c389c21e9ffafad04df8c52c90", size = 432509, upload-time = "2025-10-08T09:15:38.794Z" },
-    { url = "https://files.pythonhosted.org/packages/24/a4/e98ccdb56dc4e98c929a3f150de1799831c0a800583cde9fa022fa90602d/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f2cb069d8b981abc72b41aea1c580ce92d57c673ec61af4c500153a626cb9e20", size = 415957, upload-time = "2025-10-08T09:15:40.238Z" },
-    { url = "https://files.pythonhosted.org/packages/da/28/6951f7fb67bc0a4e184a6b38ab71a92d9ba58080b27a77d3e2fb0be5998f/msgpack-1.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d62ce1f483f355f61adb5433ebfd8868c5f078d1a52d042b0a998682b4fa8c27", size = 422910, upload-time = "2025-10-08T09:15:41.505Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/03/42106dcded51f0a0b5284d3ce30a671e7bd3f7318d122b2ead66ad289fed/msgpack-1.1.2-cp314-cp314t-win32.whl", hash = "sha256:1d1418482b1ee984625d88aa9585db570180c286d942da463533b238b98b812b", size = 75197, upload-time = "2025-10-08T09:15:42.954Z" },
-    { url = "https://files.pythonhosted.org/packages/15/86/d0071e94987f8db59d4eeb386ddc64d0bb9b10820a8d82bcd3e53eeb2da6/msgpack-1.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:5a46bf7e831d09470ad92dff02b8b1ac92175ca36b087f904a0519857c6be3ff", size = 85772, upload-time = "2025-10-08T09:15:43.954Z" },
-    { url = "https://files.pythonhosted.org/packages/81/f2/08ace4142eb281c12701fc3b93a10795e4d4dc7f753911d836675050f886/msgpack-1.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d99ef64f349d5ec3293688e91486c5fdb925ed03807f64d98d205d2713c60b46", size = 70868, upload-time = "2025-10-08T09:15:44.959Z" },
 ]
 
 [[package]]
@@ -3518,42 +3315,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ef/a0/f83ae75e42d694b3fbad3e047670e511c138be747bc713cf1b10d5096416/multidict-6.7.0-cp313-cp313t-win32.whl", hash = "sha256:19a1d55338ec1be74ef62440ca9e04a2f001a04d0cc49a4983dc320ff0f3212d", size = 47777, upload-time = "2025-10-06T14:50:47.154Z" },
     { url = "https://files.pythonhosted.org/packages/dc/80/9b174a92814a3830b7357307a792300f42c9e94664b01dee8e457551fa66/multidict-6.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:3da4fb467498df97e986af166b12d01f05d2e04f978a9c1c680ea1988e0bc4b6", size = 53104, upload-time = "2025-10-06T14:50:48.851Z" },
     { url = "https://files.pythonhosted.org/packages/cc/28/04baeaf0428d95bb7a7bea0e691ba2f31394338ba424fb0679a9ed0f4c09/multidict-6.7.0-cp313-cp313t-win_arm64.whl", hash = "sha256:b4121773c49a0776461f4a904cdf6264c88e42218aaa8407e803ca8025872792", size = 45503, upload-time = "2025-10-06T14:50:50.16Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/b1/3da6934455dd4b261d4c72f897e3a5728eba81db59959f3a639245891baa/multidict-6.7.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3bab1e4aff7adaa34410f93b1f8e57c4b36b9af0426a76003f441ee1d3c7e842", size = 75128, upload-time = "2025-10-06T14:50:51.92Z" },
-    { url = "https://files.pythonhosted.org/packages/14/2c/f069cab5b51d175a1a2cb4ccdf7a2c2dabd58aa5bd933fa036a8d15e2404/multidict-6.7.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b8512bac933afc3e45fb2b18da8e59b78d4f408399a960339598374d4ae3b56b", size = 44410, upload-time = "2025-10-06T14:50:53.275Z" },
-    { url = "https://files.pythonhosted.org/packages/42/e2/64bb41266427af6642b6b128e8774ed84c11b80a90702c13ac0a86bb10cc/multidict-6.7.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:79dcf9e477bc65414ebfea98ffd013cb39552b5ecd62908752e0e413d6d06e38", size = 43205, upload-time = "2025-10-06T14:50:54.911Z" },
-    { url = "https://files.pythonhosted.org/packages/02/68/6b086fef8a3f1a8541b9236c594f0c9245617c29841f2e0395d979485cde/multidict-6.7.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:31bae522710064b5cbeddaf2e9f32b1abab70ac6ac91d42572502299e9953128", size = 245084, upload-time = "2025-10-06T14:50:56.369Z" },
-    { url = "https://files.pythonhosted.org/packages/15/ee/f524093232007cd7a75c1d132df70f235cfd590a7c9eaccd7ff422ef4ae8/multidict-6.7.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a0df7ff02397bb63e2fd22af2c87dfa39e8c7f12947bc524dbdc528282c7e34", size = 252667, upload-time = "2025-10-06T14:50:57.991Z" },
-    { url = "https://files.pythonhosted.org/packages/02/a5/eeb3f43ab45878f1895118c3ef157a480db58ede3f248e29b5354139c2c9/multidict-6.7.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:7a0222514e8e4c514660e182d5156a415c13ef0aabbd71682fc714e327b95e99", size = 233590, upload-time = "2025-10-06T14:50:59.589Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/1e/76d02f8270b97269d7e3dbd45644b1785bda457b474315f8cf999525a193/multidict-6.7.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2397ab4daaf2698eb51a76721e98db21ce4f52339e535725de03ea962b5a3202", size = 264112, upload-time = "2025-10-06T14:51:01.183Z" },
-    { url = "https://files.pythonhosted.org/packages/76/0b/c28a70ecb58963847c2a8efe334904cd254812b10e535aefb3bcce513918/multidict-6.7.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8891681594162635948a636c9fe0ff21746aeb3dd5463f6e25d9bea3a8a39ca1", size = 261194, upload-time = "2025-10-06T14:51:02.794Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/63/2ab26e4209773223159b83aa32721b4021ffb08102f8ac7d689c943fded1/multidict-6.7.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18706cc31dbf402a7945916dd5cddf160251b6dab8a2c5f3d6d5a55949f676b3", size = 248510, upload-time = "2025-10-06T14:51:04.724Z" },
-    { url = "https://files.pythonhosted.org/packages/93/cd/06c1fa8282af1d1c46fd55c10a7930af652afdce43999501d4d68664170c/multidict-6.7.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f844a1bbf1d207dd311a56f383f7eda2d0e134921d45751842d8235e7778965d", size = 248395, upload-time = "2025-10-06T14:51:06.306Z" },
-    { url = "https://files.pythonhosted.org/packages/99/ac/82cb419dd6b04ccf9e7e61befc00c77614fc8134362488b553402ecd55ce/multidict-6.7.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:d4393e3581e84e5645506923816b9cc81f5609a778c7e7534054091acc64d1c6", size = 239520, upload-time = "2025-10-06T14:51:08.091Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/f3/a0f9bf09493421bd8716a362e0cd1d244f5a6550f5beffdd6b47e885b331/multidict-6.7.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:fbd18dc82d7bf274b37aa48d664534330af744e03bccf696d6f4c6042e7d19e7", size = 245479, upload-time = "2025-10-06T14:51:10.365Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/01/476d38fc73a212843f43c852b0eee266b6971f0e28329c2184a8df90c376/multidict-6.7.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:b6234e14f9314731ec45c42fc4554b88133ad53a09092cc48a88e771c125dadb", size = 258903, upload-time = "2025-10-06T14:51:12.466Z" },
-    { url = "https://files.pythonhosted.org/packages/49/6d/23faeb0868adba613b817d0e69c5f15531b24d462af8012c4f6de4fa8dc3/multidict-6.7.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:08d4379f9744d8f78d98c8673c06e202ffa88296f009c71bbafe8a6bf847d01f", size = 252333, upload-time = "2025-10-06T14:51:14.48Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/cc/48d02ac22b30fa247f7dad82866e4b1015431092f4ba6ebc7e77596e0b18/multidict-6.7.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9fe04da3f79387f450fd0061d4dd2e45a72749d31bf634aecc9e27f24fdc4b3f", size = 243411, upload-time = "2025-10-06T14:51:16.072Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/03/29a8bf5a18abf1fe34535c88adbdfa88c9fb869b5a3b120692c64abe8284/multidict-6.7.0-cp314-cp314-win32.whl", hash = "sha256:fbafe31d191dfa7c4c51f7a6149c9fb7e914dcf9ffead27dcfd9f1ae382b3885", size = 40940, upload-time = "2025-10-06T14:51:17.544Z" },
-    { url = "https://files.pythonhosted.org/packages/82/16/7ed27b680791b939de138f906d5cf2b4657b0d45ca6f5dd6236fdddafb1a/multidict-6.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:2f67396ec0310764b9222a1728ced1ab638f61aadc6226f17a71dd9324f9a99c", size = 45087, upload-time = "2025-10-06T14:51:18.875Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/3c/e3e62eb35a1950292fe39315d3c89941e30a9d07d5d2df42965ab041da43/multidict-6.7.0-cp314-cp314-win_arm64.whl", hash = "sha256:ba672b26069957ee369cfa7fc180dde1fc6f176eaf1e6beaf61fbebbd3d9c000", size = 42368, upload-time = "2025-10-06T14:51:20.225Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/40/cd499bd0dbc5f1136726db3153042a735fffd0d77268e2ee20d5f33c010f/multidict-6.7.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:c1dcc7524066fa918c6a27d61444d4ee7900ec635779058571f70d042d86ed63", size = 82326, upload-time = "2025-10-06T14:51:21.588Z" },
-    { url = "https://files.pythonhosted.org/packages/13/8a/18e031eca251c8df76daf0288e6790561806e439f5ce99a170b4af30676b/multidict-6.7.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:27e0b36c2d388dc7b6ced3406671b401e84ad7eb0656b8f3a2f46ed0ce483718", size = 48065, upload-time = "2025-10-06T14:51:22.93Z" },
-    { url = "https://files.pythonhosted.org/packages/40/71/5e6701277470a87d234e433fb0a3a7deaf3bcd92566e421e7ae9776319de/multidict-6.7.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2a7baa46a22e77f0988e3b23d4ede5513ebec1929e34ee9495be535662c0dfe2", size = 46475, upload-time = "2025-10-06T14:51:24.352Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/6a/bab00cbab6d9cfb57afe1663318f72ec28289ea03fd4e8236bb78429893a/multidict-6.7.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7bf77f54997a9166a2f5675d1201520586439424c2511723a7312bdb4bcc034e", size = 239324, upload-time = "2025-10-06T14:51:25.822Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/5f/8de95f629fc22a7769ade8b41028e3e5a822c1f8904f618d175945a81ad3/multidict-6.7.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e011555abada53f1578d63389610ac8a5400fc70ce71156b0aa30d326f1a5064", size = 246877, upload-time = "2025-10-06T14:51:27.604Z" },
-    { url = "https://files.pythonhosted.org/packages/23/b4/38881a960458f25b89e9f4a4fdcb02ac101cfa710190db6e5528841e67de/multidict-6.7.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:28b37063541b897fd6a318007373930a75ca6d6ac7c940dbe14731ffdd8d498e", size = 225824, upload-time = "2025-10-06T14:51:29.664Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/39/6566210c83f8a261575f18e7144736059f0c460b362e96e9cf797a24b8e7/multidict-6.7.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05047ada7a2fde2631a0ed706f1fd68b169a681dfe5e4cf0f8e4cb6618bbc2cd", size = 253558, upload-time = "2025-10-06T14:51:31.684Z" },
-    { url = "https://files.pythonhosted.org/packages/00/a3/67f18315100f64c269f46e6c0319fa87ba68f0f64f2b8e7fd7c72b913a0b/multidict-6.7.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:716133f7d1d946a4e1b91b1756b23c088881e70ff180c24e864c26192ad7534a", size = 252339, upload-time = "2025-10-06T14:51:33.699Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/2a/1cb77266afee2458d82f50da41beba02159b1d6b1f7973afc9a1cad1499b/multidict-6.7.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d1bed1b467ef657f2a0ae62844a607909ef1c6889562de5e1d505f74457d0b96", size = 244895, upload-time = "2025-10-06T14:51:36.189Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/72/09fa7dd487f119b2eb9524946ddd36e2067c08510576d43ff68469563b3b/multidict-6.7.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ca43bdfa5d37bd6aee89d85e1d0831fb86e25541be7e9d376ead1b28974f8e5e", size = 241862, upload-time = "2025-10-06T14:51:41.291Z" },
-    { url = "https://files.pythonhosted.org/packages/65/92/bc1f8bd0853d8669300f732c801974dfc3702c3eeadae2f60cef54dc69d7/multidict-6.7.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:44b546bd3eb645fd26fb949e43c02a25a2e632e2ca21a35e2e132c8105dc8599", size = 232376, upload-time = "2025-10-06T14:51:43.55Z" },
-    { url = "https://files.pythonhosted.org/packages/09/86/ac39399e5cb9d0c2ac8ef6e10a768e4d3bc933ac808d49c41f9dc23337eb/multidict-6.7.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a6ef16328011d3f468e7ebc326f24c1445f001ca1dec335b2f8e66bed3006394", size = 240272, upload-time = "2025-10-06T14:51:45.265Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/b6/fed5ac6b8563ec72df6cb1ea8dac6d17f0a4a1f65045f66b6d3bf1497c02/multidict-6.7.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:5aa873cbc8e593d361ae65c68f85faadd755c3295ea2c12040ee146802f23b38", size = 248774, upload-time = "2025-10-06T14:51:46.836Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/8d/b954d8c0dc132b68f760aefd45870978deec6818897389dace00fcde32ff/multidict-6.7.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:3d7b6ccce016e29df4b7ca819659f516f0bc7a4b3efa3bb2012ba06431b044f9", size = 242731, upload-time = "2025-10-06T14:51:48.541Z" },
-    { url = "https://files.pythonhosted.org/packages/16/9d/a2dac7009125d3540c2f54e194829ea18ac53716c61b655d8ed300120b0f/multidict-6.7.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:171b73bd4ee683d307599b66793ac80981b06f069b62eea1c9e29c9241aa66b0", size = 240193, upload-time = "2025-10-06T14:51:50.355Z" },
-    { url = "https://files.pythonhosted.org/packages/39/ca/c05f144128ea232ae2178b008d5011d4e2cea86e4ee8c85c2631b1b94802/multidict-6.7.0-cp314-cp314t-win32.whl", hash = "sha256:b2d7f80c4e1fd010b07cb26820aae86b7e73b681ee4889684fb8d2d4537aab13", size = 48023, upload-time = "2025-10-06T14:51:51.883Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/8f/0a60e501584145588be1af5cc829265701ba3c35a64aec8e07cbb71d39bb/multidict-6.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:09929cab6fcb68122776d575e03c6cc64ee0b8fca48d17e135474b042ce515cd", size = 53507, upload-time = "2025-10-06T14:51:53.672Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/ae/3148b988a9c6239903e786eac19c889fab607c31d6efa7fb2147e5680f23/multidict-6.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:cc41db090ed742f32bd2d2c721861725e6109681eddf835d0a82bd3a5c382827", size = 44804, upload-time = "2025-10-06T14:51:55.415Z" },
     { url = "https://files.pythonhosted.org/packages/b7/da/7d22601b625e241d4f23ef1ebff8acfc60da633c9e7e7922e24d10f592b3/multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3", size = 12317, upload-time = "2025-10-06T14:52:29.272Z" },
 ]
 
@@ -3562,25 +3323,15 @@ name = "multiprocess"
 version = "0.70.16"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "dill", version = "0.3.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -3623,6 +3374,54 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" },
 ]
 
+[[package]]
+name = "murmurhash"
+version = "1.0.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/23/2e/88c147931ea9725d634840d538622e94122bceaf346233349b7b5c62964b/murmurhash-1.0.15.tar.gz", hash = "sha256:58e2b27b7847f9e2a6edf10b47a8c8dd70a4705f45dccb7bf76aeadacf56ba01", size = 13291, upload-time = "2025-11-14T09:51:15.272Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/09/3c/5e59e29fe971365d27f191a5cbf8a5fb492746e458604fe5d39810da4668/murmurhash-1.0.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f4989c16053a9a83b02c520dd00a31f0877d5fd2ab8a9b6b75ed9eba0e25c489", size = 27463, upload-time = "2025-11-14T09:49:53.158Z" },
+    { url = "https://files.pythonhosted.org/packages/38/3d/ace00a9b82beaa99a8a7a52e98171cfbf13c0066d2f820e84a5d572e3bd0/murmurhash-1.0.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:899068ba3d7c371e7edd093852c634cce802fefd9aaddfcc0d2fda1d7433c7f9", size = 27714, upload-time = "2025-11-14T09:49:54.855Z" },
+    { url = "https://files.pythonhosted.org/packages/10/0f/34f1c4f97424ea1bc72b1e3bdf61ac34f4c5555ec9163721f1e4cafe5b1d/murmurhash-1.0.15-cp310-cp310-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fe883982114de576c793fd1cf55945c8ee6453ad4c4785ac1a48f84e74fdc650", size = 122570, upload-time = "2025-11-14T09:49:55.977Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/75/0019717a16ce5a7b088fc50a3ecb513035e4196c5e569bf4a2e16bcc0414/murmurhash-1.0.15-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:342277d8d7f712d136507fb3ccdba26c076a34ca0f8d1b96f65f0daa556da2e9", size = 123194, upload-time = "2025-11-14T09:49:57.462Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/a4/c1c95ce60b816c2255098164e424752779269c93f5d6dceaa213346789a2/murmurhash-1.0.15-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bc54facccb32fe1e97d6231edd4f3e2937467c35658b26aa35bbd6a87ebb7cb0", size = 122461, upload-time = "2025-11-14T09:49:58.686Z" },
+    { url = "https://files.pythonhosted.org/packages/63/28/e1f79369a6e8d1a5901346ed2fd3a5c56e647d0b849044870c071cb64e1c/murmurhash-1.0.15-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e525bbd8e26e6b9ab1b56758a59b16c2fffd73bad2f7b8bf361c16f70ff1d980", size = 121676, upload-time = "2025-11-14T09:49:59.888Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/7c/e2be1f5387e5898f6551cf81c4220975858b9dbda4d471b133750945599a/murmurhash-1.0.15-cp310-cp310-win_amd64.whl", hash = "sha256:2224f30f7729717644745a6f513ea7662517dfe7b1867cf1588177f64c61df3c", size = 25156, upload-time = "2025-11-14T09:50:01.016Z" },
+    { url = "https://files.pythonhosted.org/packages/74/07/0df6e1a753de68368662cbbb8f88558e2c877d3886ac12b30953fb8ed335/murmurhash-1.0.15-cp310-cp310-win_arm64.whl", hash = "sha256:8a181494b5f03ba831f9a13f2de3aab9ef591e508e57239043d65c5c592f5837", size = 23270, upload-time = "2025-11-14T09:50:01.99Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/ca/77d3e69924a8eb4508bb4f0ad34e46adbeedeb93616a71080e61e53dad71/murmurhash-1.0.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f32307fb9347680bb4fe1cbef6362fb39bd994f1b59abd8c09ca174e44199081", size = 27397, upload-time = "2025-11-14T09:50:03.077Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/53/a936f577d35b245d47b310f29e5e9f09fcac776c8c992f1ab51a9fb0cee2/murmurhash-1.0.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:539d8405885d1d19c005f3a2313b47e8e54b0ee89915eb8dfbb430b194328e6c", size = 27692, upload-time = "2025-11-14T09:50:04.144Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/64/5f8cfd1fd9cbeb43fcff96672f5bd9e7e1598d1c970f808ecd915490dc20/murmurhash-1.0.15-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c4cd739a00f5a4602201b74568ddabae46ec304719d9be752fd8f534a9464b5e", size = 128396, upload-time = "2025-11-14T09:50:05.268Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/10/d9ce29d559a75db0d8a3f13ea12c7f541ec9de2afca38dc70418b890eedb/murmurhash-1.0.15-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:44d211bcc3ec203c47dac06f48ee871093fcbdffa6652a6cc5ea7180306680a8", size = 128687, upload-time = "2025-11-14T09:50:06.527Z" },
+    { url = "https://files.pythonhosted.org/packages/48/cd/dc97ab7e68cdfa1537a56e36dbc846c5a66701cc39ecee2d4399fe61996c/murmurhash-1.0.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f9bf47101354fb1dc4b2e313192566f04ba295c28a37e2f71c692759acc1ba3c", size = 128198, upload-time = "2025-11-14T09:50:08.062Z" },
+    { url = "https://files.pythonhosted.org/packages/53/73/32f2aaa22c1e4afae337106baf0c938abf36a6cc879cfee83a00461bbbf7/murmurhash-1.0.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3c69b4d3bcd6233782a78907fe10b9b7a796bdc5d28060cf097d067bec280a5d", size = 127214, upload-time = "2025-11-14T09:50:09.265Z" },
+    { url = "https://files.pythonhosted.org/packages/82/ed/812103a7f353eba2d83655b08205e13a38c93b4db0692f94756e1eb44516/murmurhash-1.0.15-cp311-cp311-win_amd64.whl", hash = "sha256:e43a69496342ce530bdd670264cb7c8f45490b296e4764c837ce577e3c7ebd53", size = 25241, upload-time = "2025-11-14T09:50:10.373Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/5f/2c511bdd28f7c24da37a00116ffd0432b65669d098f0d0260c66ac0ffdc2/murmurhash-1.0.15-cp311-cp311-win_arm64.whl", hash = "sha256:f3e99a6ee36ef5372df5f138e3d9c801420776d3641a34a49e5c2555f44edba7", size = 23216, upload-time = "2025-11-14T09:50:11.651Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/46/be8522d3456fdccf1b8b049c6d82e7a3c1114c4fc2cfe14b04cba4b3e701/murmurhash-1.0.15-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d37e3ae44746bca80b1a917c2ea625cf216913564ed43f69d2888e5df97db0cb", size = 27884, upload-time = "2025-11-14T09:50:13.133Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/cc/630449bf4f6178d7daf948ce46ad00b25d279065fc30abd8d706be3d87e0/murmurhash-1.0.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0861cb11039409eaf46878456b7d985ef17b6b484103a6fc367b2ecec846891d", size = 27855, upload-time = "2025-11-14T09:50:14.859Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/30/ea8f601a9bf44db99468696efd59eb9cff1157cd55cb586d67116697583f/murmurhash-1.0.15-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5a301decfaccfec70fe55cb01dde2a012c3014a874542eaa7cc73477bb749616", size = 134088, upload-time = "2025-11-14T09:50:15.958Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/de/c40ce8c0877d406691e735b8d6e9c815f36a82b499d358313db5dbe219d7/murmurhash-1.0.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:32c6fde7bd7e9407003370a07b5f4addacabe1556ad3dc2cac246b7a2bba3400", size = 133978, upload-time = "2025-11-14T09:50:17.572Z" },
+    { url = "https://files.pythonhosted.org/packages/47/84/bd49963ecd84ebab2fe66595e2d1ed41d5e8b5153af5dc930f0bd827007c/murmurhash-1.0.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d8b43a7011540dc3c7ce66f2134df9732e2bc3bbb4a35f6458bc755e48bde26", size = 132956, upload-time = "2025-11-14T09:50:18.742Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/7c/2530769c545074417c862583f05f4245644599f1e9ff619b3dfe2969aafc/murmurhash-1.0.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43bf4541892ecd95963fcd307bf1c575fc0fee1682f41c93007adee71ca2bb40", size = 134184, upload-time = "2025-11-14T09:50:19.941Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a4/b249b042f5afe34d14ada2dc4afc777e883c15863296756179652e081c44/murmurhash-1.0.15-cp312-cp312-win_amd64.whl", hash = "sha256:f4ac15a2089dc42e6eb0966622d42d2521590a12c92480aafecf34c085302cca", size = 25647, upload-time = "2025-11-14T09:50:21.049Z" },
+    { url = "https://files.pythonhosted.org/packages/13/bf/028179259aebc18fd4ba5cae2601d1d47517427a537ab44336446431a215/murmurhash-1.0.15-cp312-cp312-win_arm64.whl", hash = "sha256:4a70ca4ae19e600d9be3da64d00710e79dde388a4d162f22078d64844d0ebdda", size = 23338, upload-time = "2025-11-14T09:50:22.359Z" },
+    { url = "https://files.pythonhosted.org/packages/29/2f/ba300b5f04dae0409202d6285668b8a9d3ade43a846abee3ef611cb388d5/murmurhash-1.0.15-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fe50dc70e52786759358fd1471e309b94dddfffb9320d9dfea233c7684c894ba", size = 27861, upload-time = "2025-11-14T09:50:23.804Z" },
+    { url = "https://files.pythonhosted.org/packages/34/02/29c19d268e6f4ea1ed2a462c901eed1ed35b454e2cbc57da592fad663ac6/murmurhash-1.0.15-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1349a7c23f6092e7998ddc5bd28546cc31a595afc61e9fdb3afc423feec3d7ad", size = 27840, upload-time = "2025-11-14T09:50:25.146Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/63/58e2de2b5232cd294c64092688c422196e74f9fa8b3958bdf02d33df24b9/murmurhash-1.0.15-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3ba6d05de2613535b5a9227d4ad8ef40a540465f64660d4a8800634ae10e04f", size = 133080, upload-time = "2025-11-14T09:50:26.566Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/9a/d13e2e9f8ba1ced06840921a50f7cece0a475453284158a3018b72679761/murmurhash-1.0.15-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fa1b70b3cc2801ab44179c65827bbd12009c68b34e9d9ce7125b6a0bd35af63c", size = 132648, upload-time = "2025-11-14T09:50:27.788Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/e1/47994f1813fa205c84977b0ff51ae6709f8539af052c7491a5f863d82bdc/murmurhash-1.0.15-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:213d710fb6f4ef3bc11abbfad0fa94a75ffb675b7dc158c123471e5de869f9af", size = 131502, upload-time = "2025-11-14T09:50:29.339Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/ea/90c1fd00b4aeb704fb5e84cd666b33ffd7f245155048071ffbb51d2bb57d/murmurhash-1.0.15-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b65a5c4e7f5d71f7ccac2d2b60bdf7092d7976270878cfec59d5a66a533db823", size = 132736, upload-time = "2025-11-14T09:50:30.545Z" },
+    { url = "https://files.pythonhosted.org/packages/00/db/da73462dbfa77f6433b128d2120ba7ba300f8c06dc4f4e022c38d240a5f5/murmurhash-1.0.15-cp313-cp313-win_amd64.whl", hash = "sha256:9aba94c5d841e1904cd110e94ceb7f49cfb60a874bbfb27e0373622998fb7c7c", size = 25682, upload-time = "2025-11-14T09:50:31.624Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/83/032729ef14971b938fbef41ee125fc8800020ee229bd35178b6ede8ee934/murmurhash-1.0.15-cp313-cp313-win_arm64.whl", hash = "sha256:263807eca40d08c7b702413e45cca75ecb5883aa337237dc5addb660f1483378", size = 23370, upload-time = "2025-11-14T09:50:33.264Z" },
+    { url = "https://files.pythonhosted.org/packages/10/83/7547d9205e9bd2f8e5dfd0b682cc9277594f98909f228eb359489baec1df/murmurhash-1.0.15-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:694fd42a74b7ce257169d14c24aa616aa6cd4ccf8abe50eca0557e08da99d055", size = 29955, upload-time = "2025-11-14T09:50:34.488Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/c7/3afd5de7a5b3ae07fe2d3a3271b327ee1489c58ba2b2f2159bd31a25edb9/murmurhash-1.0.15-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a2ea4546ba426390beff3cd10db8f0152fdc9072c4f2583ec7d8aa9f3e4ac070", size = 30108, upload-time = "2025-11-14T09:50:35.53Z" },
+    { url = "https://files.pythonhosted.org/packages/02/69/d6637ee67d78ebb2538c00411f28ea5c154886bbe1db16c49435a8a4ab16/murmurhash-1.0.15-cp313-cp313t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:34e5a91139c40b10f98d0b297907f5d5267b4b1b2e5dd2eb74a021824f751b98", size = 164054, upload-time = "2025-11-14T09:50:36.591Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/4c/89e590165b4c7da6bf941441212a721a270195332d3aacfdfdf527d466ca/murmurhash-1.0.15-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:dc35606868a5961cf42e79314ca0bddf5a400ce377b14d83192057928d6252ec", size = 168153, upload-time = "2025-11-14T09:50:37.856Z" },
+    { url = "https://files.pythonhosted.org/packages/07/7a/95c42df0c21d2e413b9fcd17317a7587351daeb264dc29c6aec1fdbd26f8/murmurhash-1.0.15-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:43cc6ac3b91ca0f7a5ae9c063ba4d6c26972c97fd7c25280ecc666413e4c5535", size = 164345, upload-time = "2025-11-14T09:50:39.346Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/22/9d02c880a88b83bb3ce7d6a38fb727373ab78d82e5f3d8d9fc5612219f90/murmurhash-1.0.15-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:847d712136cb462f0e4bd6229ee2d9eb996d8854eb8312dff3d20c8f5181fda5", size = 161990, upload-time = "2025-11-14T09:50:40.689Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/e3/750232524e0dc262e8dcede6536dafc766faadd9a52f1d23746b02948ad8/murmurhash-1.0.15-cp313-cp313t-win_amd64.whl", hash = "sha256:2680851af6901dbe66cc4aa7ef8e263de47e6e1b425ae324caa571bdf18f8d58", size = 28812, upload-time = "2025-11-14T09:50:41.971Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/89/4ad9d215ef6ade89f27a72dc4e86b98ef1a43534cc3e6a6900a362a0bf0a/murmurhash-1.0.15-cp313-cp313t-win_arm64.whl", hash = "sha256:189a8de4d657b5da9efd66601b0636330b08262b3a55431f2379097c986995d0", size = 25398, upload-time = "2025-11-14T09:50:43.023Z" },
+]
+
 [[package]]
 name = "mypy"
 version = "1.19.0"
@@ -3660,12 +3459,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/77/06/b6b8994ce07405f6039701f4b66e9d23f499d0b41c6dd46ec28f96d57ec3/mypy-1.19.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:37af5166f9475872034b56c5efdcf65ee25394e9e1d172907b84577120714364", size = 13593323, upload-time = "2025-11-28T15:46:34.699Z" },
     { url = "https://files.pythonhosted.org/packages/68/b1/126e274484cccdf099a8e328d4fda1c7bdb98a5e888fa6010b00e1bbf330/mypy-1.19.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:510c014b722308c9bd377993bcbf9a07d7e0692e5fa8fc70e639c1eb19fc6bee", size = 13818032, upload-time = "2025-11-28T15:46:18.286Z" },
     { url = "https://files.pythonhosted.org/packages/f8/56/53a8f70f562dfc466c766469133a8a4909f6c0012d83993143f2a9d48d2d/mypy-1.19.0-cp313-cp313-win_amd64.whl", hash = "sha256:cabbee74f29aa9cd3b444ec2f1e4fa5a9d0d746ce7567a6a609e224429781f53", size = 10120644, upload-time = "2025-11-28T15:47:43.99Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/f4/7751f32f56916f7f8c229fe902cbdba3e4dd3f3ea9e8b872be97e7fc546d/mypy-1.19.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f2e36bed3c6d9b5f35d28b63ca4b727cb0228e480826ffc8953d1892ddc8999d", size = 13185236, upload-time = "2025-11-28T15:45:20.696Z" },
-    { url = "https://files.pythonhosted.org/packages/35/31/871a9531f09e78e8d145032355890384f8a5b38c95a2c7732d226b93242e/mypy-1.19.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a18d8abdda14035c5718acb748faec09571432811af129bf0d9e7b2d6699bf18", size = 12213902, upload-time = "2025-11-28T15:46:10.117Z" },
-    { url = "https://files.pythonhosted.org/packages/58/b8/af221910dd40eeefa2077a59107e611550167b9994693fc5926a0b0f87c0/mypy-1.19.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f75e60aca3723a23511948539b0d7ed514dda194bc3755eae0bfc7a6b4887aa7", size = 12738600, upload-time = "2025-11-28T15:44:22.521Z" },
-    { url = "https://files.pythonhosted.org/packages/11/9f/c39e89a3e319c1d9c734dedec1183b2cc3aefbab066ec611619002abb932/mypy-1.19.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8f44f2ae3c58421ee05fe609160343c25f70e3967f6e32792b5a78006a9d850f", size = 13592639, upload-time = "2025-11-28T15:48:08.55Z" },
-    { url = "https://files.pythonhosted.org/packages/97/6d/ffaf5f01f5e284d9033de1267e6c1b8f3783f2cf784465378a86122e884b/mypy-1.19.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:63ea6a00e4bd6822adbfc75b02ab3653a17c02c4347f5bb0cf1d5b9df3a05835", size = 13799132, upload-time = "2025-11-28T15:47:06.032Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/b0/c33921e73aaa0106224e5a34822411bea38046188eb781637f5a5b07e269/mypy-1.19.0-cp314-cp314-win_amd64.whl", hash = "sha256:3ad925b14a0bb99821ff6f734553294aa6a3440a8cb082fe1f5b84dfb662afb1", size = 10269832, upload-time = "2025-11-28T15:47:29.392Z" },
     { url = "https://files.pythonhosted.org/packages/09/0e/fe228ed5aeab470c6f4eb82481837fadb642a5aa95cc8215fd2214822c10/mypy-1.19.0-py3-none-any.whl", hash = "sha256:0c01c99d626380752e527d5ce8e69ffbba2046eb8a060db0329690849cf9b6f9", size = 2469714, upload-time = "2025-11-28T15:45:33.22Z" },
 ]
 
@@ -3698,25 +3491,86 @@ wheels = [
 
 [[package]]
 name = "nemo-curator"
-version = "1.2.0+ac5abfb"
-source = { git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main#ac5abfb5d0049ffbfb9663489cc9fd640d022bc0" }
+version = "1.2.0+ad58743"
+source = { git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main#ad58743668d5a6c324fb1af63225d474f985344c" }
 dependencies = [
-    { name = "absl-py", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "absl-py", marker = "python_full_version >= '3.11'" },
     { name = "comment-parser", version = "1.2.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.13'" },
-    { name = "comment-parser", version = "1.2.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.13.*'" },
-    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "hydra-core", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jieba", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "loguru", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "mecab-python3", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "omegaconf", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "openai", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pandas", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "ray", extra = ["data", "default"], marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "torch", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "transformers", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "comment-parser", version = "1.2.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
+    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "hydra-core", marker = "python_full_version >= '3.11'" },
+    { name = "jieba", marker = "python_full_version >= '3.11'" },
+    { name = "loguru", marker = "python_full_version >= '3.11'" },
+    { name = "mecab-python3", marker = "python_full_version >= '3.11'" },
+    { name = "omegaconf", marker = "python_full_version >= '3.11'" },
+    { name = "openai", marker = "python_full_version >= '3.11'" },
+    { name = "pandas", marker = "python_full_version >= '3.11'" },
+    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "ray", extra = ["data", "default"], marker = "python_full_version >= '3.11'" },
+    { name = "torch", marker = "python_full_version >= '3.11'" },
+    { name = "transformers", marker = "python_full_version >= '3.11'" },
+]
+
+[package.optional-dependencies]
+translation-all = [
+    { name = "aiohttp", marker = "python_full_version >= '3.11'" },
+    { name = "boto3", marker = "python_full_version >= '3.11'" },
+    { name = "google-cloud-translate", marker = "python_full_version >= '3.11'" },
+    { name = "iso639-lang", marker = "python_full_version >= '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "sacrebleu", marker = "python_full_version >= '3.11'" },
+    { name = "spacy", marker = "python_full_version >= '3.11'" },
+]
+
+[[package]]
+name = "nemo-evaluator"
+version = "0.2.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "flask" },
+    { name = "httpx" },
+    { name = "jinja2" },
+    { name = "psutil" },
+    { name = "pydantic" },
+    { name = "pydantic-core" },
+    { name = "pyyaml" },
+    { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "structlog" },
+    { name = "typing-extensions" },
+    { name = "waitress" },
+    { name = "werkzeug" },
+    { name = "yq" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ce/f6/d9273a828e69c9d01dd5924b5040206ff5480d8228a9048cdeced75ea096/nemo_evaluator-0.2.8.tar.gz", hash = "sha256:3d7b52f41eff5f6ac3c07c95bf3d9c4976322087d82a6e75e7e503ece02a8f26", size = 183827, upload-time = "2026-05-08T06:50:58.853Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/21/a3/7cb0292feaad9eda43c2ec4ff9e574e0e7aba20b06bbe10f7c7ff3a24e88/nemo_evaluator-0.2.8-py3-none-any.whl", hash = "sha256:231c1c03d83fae47be0f18ab099e03c2fabf0955c18d17753a7a0b6aef76868f", size = 230578, upload-time = "2026-05-08T06:50:57.676Z" },
+]
+
+[[package]]
+name = "nemo-evaluator-launcher"
+version = "0.2.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+    { name = "hydra-core" },
+    { name = "jinja2" },
+    { name = "leptonai" },
+    { name = "nemo-evaluator" },
+    { name = "pydantic" },
+    { name = "python-dotenv" },
+    { name = "pyyaml" },
+    { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "simple-parsing" },
+    { name = "structlog" },
+    { name = "tabulate" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c1/13/fca3cfe46624e428750954b9197cb599a04a64b43f6c1df2747c73c4f89c/nemo_evaluator_launcher-0.2.5.tar.gz", hash = "sha256:401f9c2a401d7c23cb9ee6bf038e01062b114a0f7a5a76749dd66cc43a6d4b87", size = 242916, upload-time = "2026-04-16T09:49:06.495Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/35/724441efee2434a5b2ce90a08e399eb1d4a77f874508130ba9c8eb94ebfc/nemo_evaluator_launcher-0.2.5-py3-none-any.whl", hash = "sha256:035b2bc32ea083cf3ab8e902e1da963b276b54fa4454bb654b8b247b29574706", size = 309848, upload-time = "2026-04-16T09:49:05.159Z" },
 ]
 
 [[package]]
@@ -3752,8 +3606,8 @@ dependencies = [
     { name = "huggingface-hub" },
     { name = "jinja2" },
     { name = "nemo-run" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "omegaconf" },
     { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pyarrow", version = "22.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3773,20 +3627,20 @@ dependencies = [
 
 [package.optional-dependencies]
 all = [
-    { name = "bcp47", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "cosmos-xenna", version = "0.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" },
+    { name = "bcp47", marker = "python_full_version >= '3.11'" },
     { name = "cosmos-xenna", version = "0.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cuda-python", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cudf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cuml-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "data-designer", marker = "python_full_version >= '3.11'" },
     { name = "gcsfs", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "gcsfs", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "imageio-ffmpeg" },
     { name = "iso639-lang", marker = "python_full_version >= '3.11'" },
-    { name = "nemo-curator", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "nemo-curator", extra = ["translation-all"], marker = "python_full_version >= '3.11'" },
+    { name = "nemo-evaluator-launcher" },
     { name = "pandas", marker = "python_full_version >= '3.11'" },
     { name = "pylibcugraph-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "pylibraft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
@@ -3806,25 +3660,53 @@ audio = [
     { name = "webdataset" },
 ]
 byob = [
-    { name = "bcp47", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "attrs", marker = "python_full_version >= '3.11'" },
+    { name = "bcp47", marker = "python_full_version >= '3.11'" },
+    { name = "cattrs", marker = "python_full_version >= '3.11'" },
+    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "data-designer", marker = "python_full_version >= '3.11'" },
+    { name = "datasets", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "iso639-lang", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "loguru", marker = "python_full_version >= '3.11'" },
+    { name = "nemo-curator", extra = ["translation-all"], marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "obstore", marker = "python_full_version >= '3.11'" },
+    { name = "pandas", marker = "python_full_version >= '3.11'" },
+    { name = "portpicker", marker = "python_full_version >= '3.11'" },
+    { name = "pulp", marker = "python_full_version >= '3.11'" },
+    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "sacrebleu", marker = "python_full_version >= '3.11'" },
+    { name = "tabulate", marker = "python_full_version >= '3.11'" },
+    { name = "tqdm", marker = "python_full_version >= '3.11'" },
+    { name = "urllib3" },
+]
+byob-gpu = [
     { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cuda-python", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cudf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "cuml-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "iso639-lang", marker = "python_full_version >= '3.11'" },
-    { name = "nemo-curator", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pandas", marker = "python_full_version >= '3.11'" },
+    { name = "cupy-cuda12x", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "pylibcugraph-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "pylibraft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "raft-dask-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "rapidsmpf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "sacrebleu", marker = "python_full_version >= '3.11'" },
     { name = "scikit-learn", marker = "python_full_version >= '3.11'" },
     { name = "sentence-transformers", marker = "python_full_version >= '3.11'" },
+    { name = "torch" },
+    { name = "transformers", marker = "python_full_version >= '3.11'" },
+]
+curate = [
+    { name = "huggingface-hub" },
+    { name = "nemo-curator", marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
 ]
 data-sdg = [
-    { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "data-designer", marker = "python_full_version >= '3.11'" },
 ]
 dev = [
     { name = "mypy" },
@@ -3833,6 +3715,9 @@ dev = [
     { name = "ruff", version = "0.14.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "ruff", version = "0.15.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
+evaluator = [
+    { name = "nemo-evaluator-launcher" },
+]
 gcs = [
     { name = "gcsfs", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "gcsfs", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -3844,13 +3729,27 @@ s3 = [
 sentencepiece = [
     { name = "sentencepiece" },
 ]
+translate = [
+    { name = "attrs", marker = "python_full_version >= '3.11'" },
+    { name = "bcp47", marker = "python_full_version >= '3.11'" },
+    { name = "cattrs", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "loguru", marker = "python_full_version >= '3.11'" },
+    { name = "nemo-curator", extra = ["translation-all"], marker = "python_full_version >= '3.11'" },
+    { name = "obstore", marker = "python_full_version >= '3.11'" },
+    { name = "portpicker", marker = "python_full_version >= '3.11'" },
+    { name = "pulp", marker = "python_full_version >= '3.11'" },
+    { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
+    { name = "sacrebleu", marker = "python_full_version >= '3.11'" },
+    { name = "tabulate", marker = "python_full_version >= '3.11'" },
+]
 wandb = [
     { name = "wandb" },
 ]
 xenna = [
-    { name = "cosmos-xenna", version = "0.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14'" },
     { name = "cosmos-xenna", version = "0.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "cosmos-xenna", version = "0.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 
 [package.dev-dependencies]
@@ -3875,72 +3774,110 @@ run = [
 
 [package.metadata]
 requires-dist = [
+    { name = "attrs", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=25.4,<26" },
+    { name = "attrs", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=25.4,<26" },
     { name = "bcp47", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'all'", specifier = ">=0.1.0" },
     { name = "bcp47", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'byob'", specifier = ">=0.1.0" },
+    { name = "bcp47", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'translate'", specifier = ">=0.1.0" },
+    { name = "cattrs", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=25.3,<26" },
+    { name = "cattrs", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=25.3,<26" },
     { name = "colorama", specifier = ">=0.4.6" },
+    { name = "cosmos-xenna", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'byob'", specifier = ">=0.2,<0.3" },
     { name = "cosmos-xenna", marker = "extra == 'all'" },
     { name = "cosmos-xenna", marker = "extra == 'xenna'" },
     { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = ">=12.9,<13" },
-    { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = ">=12.9,<13" },
+    { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = ">=12.9,<13" },
     { name = "cuda-python", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = ">=12.9,<13" },
-    { name = "cuda-python", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = ">=12.9,<13" },
+    { name = "cuda-python", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = ">=12.9,<13" },
     { name = "cudf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "cudf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
+    { name = "cudf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
     { name = "cuml-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "cuml-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
+    { name = "cuml-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
+    { name = "cupy-cuda12x", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = ">=14.0,<15" },
     { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'all'", specifier = "==0.5.5" },
     { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'byob'", specifier = "==0.5.5" },
     { name = "data-designer", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'data-sdg'", specifier = "==0.5.5" },
     { name = "datasets", specifier = ">=2.14.0" },
+    { name = "datasets", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.14.0" },
     { name = "fsspec", specifier = ">=2024.0.0" },
     { name = "gcsfs", marker = "extra == 'all'", specifier = ">=2024.0.0" },
     { name = "gcsfs", marker = "extra == 'gcs'", specifier = ">=2024.0.0" },
     { name = "huggingface-hub", specifier = ">=0.20.0" },
+    { name = "huggingface-hub", marker = "python_full_version >= '3.11' and extra == 'curate'", specifier = ">=0.20.0" },
     { name = "imageio-ffmpeg", marker = "extra == 'all'", specifier = ">=0.5.1" },
     { name = "imageio-ffmpeg", marker = "extra == 'audio'", specifier = ">=0.5.1" },
     { name = "iso639-lang", marker = "python_full_version >= '3.11' and extra == 'all'", specifier = ">=2.6.0,<3.0.0" },
     { name = "iso639-lang", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.6.0,<3.0.0" },
     { name = "jinja2", specifier = ">=3.0.0" },
+    { name = "jinja2", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=3.1,<4" },
+    { name = "jinja2", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=3.1,<4" },
+    { name = "loguru", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=0.7,<1" },
+    { name = "loguru", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=0.7,<1" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.0.0" },
-    { name = "nemo-curator", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'all'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
-    { name = "nemo-curator", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'byob'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
+    { name = "nemo-curator", marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'curate'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
+    { name = "nemo-curator", extras = ["translation-all"], marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'all'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
+    { name = "nemo-curator", extras = ["translation-all"], marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'byob'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
+    { name = "nemo-curator", extras = ["translation-all"], marker = "python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'translate'", git = "https://github.com/NVIDIA-NeMo/Curator.git?rev=main" },
+    { name = "nemo-evaluator-launcher", marker = "extra == 'all'", specifier = ">=0.1.0" },
+    { name = "nemo-evaluator-launcher", marker = "extra == 'evaluator'", specifier = ">=0.1.0" },
     { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=main" },
     { name = "numpy", specifier = ">=1.24.0" },
+    { name = "numpy", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.2,<3" },
+    { name = "obstore", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=0.8,<0.9" },
+    { name = "obstore", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=0.8,<0.9" },
     { name = "omegaconf", specifier = ">=2.3.0" },
     { name = "pandas", marker = "python_full_version >= '3.11' and extra == 'all'", specifier = ">=2.1.0" },
     { name = "pandas", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.1.0" },
+    { name = "portpicker", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=1.6,<2" },
+    { name = "portpicker", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=1.6,<2" },
+    { name = "pulp", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=3.3,<4" },
+    { name = "pulp", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=3.3,<4" },
     { name = "pyarrow", specifier = ">=14.0.0" },
+    { name = "pyarrow", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=14.0.0" },
+    { name = "pyarrow", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=14.0.0" },
     { name = "pydantic", specifier = ">=2.0.0" },
+    { name = "pydantic", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.0.0" },
     { name = "pydantic-settings", specifier = ">=2.12.0" },
     { name = "pylibcugraph-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "pylibcugraph-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
+    { name = "pylibcugraph-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
     { name = "pylibraft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "pylibraft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
-    { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" },
+    { name = "pylibraft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
+    { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.3" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pyyaml", specifier = ">=6.0" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=6.0" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11' and extra == 'curate'", specifier = ">=6.0" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=6.0" },
     { name = "raft-dask-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "raft-dask-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
+    { name = "raft-dask-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
     { name = "rapidsmpf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'all'", specifier = "==25.10.*" },
-    { name = "rapidsmpf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob'", specifier = "==25.10.*" },
+    { name = "rapidsmpf-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'byob-gpu'", specifier = "==25.10.*" },
     { name = "ray", extras = ["default", "data"], specifier = ">=2.54" },
+    { name = "requests", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.0.0" },
     { name = "rich", specifier = ">=13.0.0" },
     { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.1.0" },
     { name = "s3fs", marker = "extra == 'all'", specifier = ">=2024.0.0" },
     { name = "s3fs", marker = "extra == 's3'", specifier = ">=2024.0.0" },
     { name = "sacrebleu", marker = "python_full_version >= '3.11' and extra == 'all'", specifier = ">=2.6.0,<3.0.0" },
     { name = "sacrebleu", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.6.0,<3.0.0" },
+    { name = "sacrebleu", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=2.6.0,<3.0.0" },
     { name = "scikit-learn", marker = "python_full_version >= '3.11' and extra == 'all'", specifier = ">=1.7.0,<1.8.0" },
-    { name = "scikit-learn", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=1.7.0,<1.8.0" },
+    { name = "scikit-learn", marker = "python_full_version >= '3.11' and extra == 'byob-gpu'", specifier = ">=1.7.0,<1.8.0" },
     { name = "sentence-transformers", marker = "python_full_version >= '3.11' and extra == 'all'", specifier = ">=5.0.0,<6.0.0" },
-    { name = "sentence-transformers", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=5.0.0,<6.0.0" },
+    { name = "sentence-transformers", marker = "python_full_version >= '3.11' and extra == 'byob-gpu'", specifier = ">=5.0.0,<6.0.0" },
     { name = "sentencepiece", marker = "extra == 'all'", specifier = ">=0.2.0" },
     { name = "sentencepiece", marker = "extra == 'sentencepiece'", specifier = ">=0.2.0" },
+    { name = "tabulate", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=0.9,<1" },
+    { name = "tabulate", marker = "python_full_version >= '3.11' and extra == 'translate'", specifier = ">=0.9,<1" },
     { name = "textual", specifier = ">=0.70.0" },
     { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0.0" },
     { name = "tomlkit", specifier = ">=0.12.0" },
-    { name = "transformers", specifier = ">=4.36.0" },
+    { name = "torch", marker = "python_full_version >= '3.11' and extra == 'byob-gpu'", specifier = ">=2.10,<2.11" },
+    { name = "tqdm", marker = "python_full_version >= '3.11' and extra == 'byob'" },
+    { name = "transformers", specifier = ">=4.57.6,<5.0" },
+    { name = "transformers", marker = "python_full_version >= '3.11' and extra == 'byob-gpu'", specifier = ">=4.57.6,<5.0" },
     { name = "typer", specifier = ">=0.12.0" },
+    { name = "urllib3", marker = "python_full_version >= '3.11' and extra == 'byob'", specifier = ">=2.7.0,<3" },
     { name = "wandb", specifier = ">=0.23.1" },
     { name = "wandb", marker = "extra == 'all'", specifier = ">=0.15.0" },
     { name = "wandb", marker = "extra == 'wandb'", specifier = ">=0.15.0" },
@@ -3948,10 +3885,10 @@ requires-dist = [
     { name = "webdataset", marker = "extra == 'audio'", specifier = ">=0.2.86" },
     { name = "xxhash", specifier = ">=3.4.0" },
 ]
-provides-extras = ["wandb", "s3", "gcs", "sentencepiece", "xenna", "audio", "data-sdg", "byob", "dev", "all"]
+provides-extras = ["wandb", "s3", "gcs", "sentencepiece", "xenna", "audio", "data-sdg", "byob", "byob-gpu", "translate", "evaluator", "curate", "dev", "all"]
 
 [package.metadata.requires-dev]
-dev = [{ name = "pytest", specifier = ">=9.0.2" }]
+dev = [{ name = "pytest", specifier = ">=9.0.3" }]
 docs = [
     { name = "myst-parser", specifier = ">=4.0.1" },
     { name = "nvidia-sphinx-theme", specifier = ">=0.0.8" },
@@ -3982,25 +3919,15 @@ name = "networkx"
 version = "3.6.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
 wheels = [
@@ -4052,18 +3979,9 @@ name = "numpy"
 version = "2.2.6"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
@@ -4130,13 +4048,12 @@ name = "numpy"
 version = "2.3.5"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
 wheels = [
@@ -4144,6 +4061,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2a/ea/25e26fa5837106cde46ae7d0b667e20f69cbbc0efd64cba8221411ab26ae/numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218", size = 12528324, upload-time = "2025-11-16T22:49:22.582Z" },
     { url = "https://files.pythonhosted.org/packages/4d/1a/e85f0eea4cf03d6a0228f5c0256b53f2df4bc794706e7df019fc622e47f1/numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d", size = 5356872, upload-time = "2025-11-16T22:49:25.408Z" },
     { url = "https://files.pythonhosted.org/packages/5c/bb/35ef04afd567f4c989c2060cde39211e4ac5357155c1833bcd1166055c61/numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5", size = 6893148, upload-time = "2025-11-16T22:49:27.549Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/2b/05bbeb06e2dff5eab512dfc678b1cc5ee94d8ac5956a0885c64b6b26252b/numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7", size = 14557282, upload-time = "2025-11-16T22:49:30.964Z" },
+    { url = "https://files.pythonhosted.org/packages/65/fb/2b23769462b34398d9326081fad5655198fcf18966fcb1f1e49db44fbf31/numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4", size = 16897903, upload-time = "2025-11-16T22:49:34.191Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/14/085f4cf05fc3f1e8aa95e85404e984ffca9b2275a5dc2b1aae18a67538b8/numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e", size = 16341672, upload-time = "2025-11-16T22:49:37.2Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/3b/1f73994904142b2aa290449b3bb99772477b5fd94d787093e4f24f5af763/numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748", size = 18838896, upload-time = "2025-11-16T22:49:39.727Z" },
     { url = "https://files.pythonhosted.org/packages/cd/b9/cf6649b2124f288309ffc353070792caf42ad69047dcc60da85ee85fea58/numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c", size = 6563608, upload-time = "2025-11-16T22:49:42.079Z" },
     { url = "https://files.pythonhosted.org/packages/aa/44/9fe81ae1dcc29c531843852e2874080dc441338574ccc4306b39e2ff6e59/numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c", size = 13078442, upload-time = "2025-11-16T22:49:43.99Z" },
     { url = "https://files.pythonhosted.org/packages/6d/a7/f99a41553d2da82a20a2f22e93c94f928e4490bb447c9ff3c4ff230581d3/numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa", size = 10458555, upload-time = "2025-11-16T22:49:47.092Z" },
@@ -4151,6 +4072,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" },
     { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" },
     { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" },
+    { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" },
     { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" },
     { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" },
     { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" },
@@ -4158,6 +4083,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" },
     { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" },
     { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" },
+    { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" },
+    { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" },
     { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" },
     { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" },
     { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" },
@@ -4165,27 +4094,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" },
     { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" },
     { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" },
+    { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" },
+    { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" },
     { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" },
     { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" },
     { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" },
-    { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" },
-    { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" },
-    { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" },
-    { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" },
-    { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" },
-    { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" },
-    { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" },
-    { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" },
     { url = "https://files.pythonhosted.org/packages/c6/65/f9dea8e109371ade9c782b4e4756a82edf9d3366bca495d84d79859a0b79/numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310", size = 16910689, upload-time = "2025-11-16T22:52:23.247Z" },
     { url = "https://files.pythonhosted.org/packages/00/4f/edb00032a8fb92ec0a679d3830368355da91a69cab6f3e9c21b64d0bb986/numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c", size = 12457053, upload-time = "2025-11-16T22:52:26.367Z" },
     { url = "https://files.pythonhosted.org/packages/16/a4/e8a53b5abd500a63836a29ebe145fc1ab1f2eefe1cfe59276020373ae0aa/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18", size = 5285635, upload-time = "2025-11-16T22:52:29.266Z" },
     { url = "https://files.pythonhosted.org/packages/a3/2f/37eeb9014d9c8b3e9c55bc599c68263ca44fdbc12a93e45a21d1d56df737/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff", size = 6801770, upload-time = "2025-11-16T22:52:31.421Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/e4/68d2f474df2cb671b2b6c2986a02e520671295647dad82484cde80ca427b/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb", size = 14391768, upload-time = "2025-11-16T22:52:33.593Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/50/94ccd8a2b141cb50651fddd4f6a48874acb3c91c8f0842b08a6afc4b0b21/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7", size = 16729263, upload-time = "2025-11-16T22:52:36.369Z" },
     { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" },
 ]
 
@@ -4242,7 +4163,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.10.2.21"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
@@ -4253,7 +4174,7 @@ name = "nvidia-cufft-cu12"
 version = "11.3.3.83"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
@@ -4280,9 +4201,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.3.90"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-cusparse-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
@@ -4293,7 +4214,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.8.93"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
@@ -4372,8 +4293,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c0/d1/08f22448d83481408d663065764ba583df091a7de629ed38fc97e522f1af/nvtx-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ca8030a6d197952318013dd1c12c22da1d4b9feb76ba72e0fcd449961183c2c", size = 806187, upload-time = "2026-03-18T10:13:32.972Z" },
     { url = "https://files.pythonhosted.org/packages/b1/c0/4a5bb7897918de7c7e0191d9342df8ae4cb797ff07276e0f20d13e497ce7/nvtx-0.2.15-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10749686633f880ad53dcdbb2179fad41b45dcf5b7631d4a1070a577577bd386", size = 782575, upload-time = "2026-03-18T10:13:57.3Z" },
     { url = "https://files.pythonhosted.org/packages/38/56/c7e8645061cc2fc23f3a54f33e1e340df59216f07dcfb97d46b8ae7dd26c/nvtx-0.2.15-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3741edac4678b92f03d22a3f0a2dfd469f422f85e63db71b038e02525b2404ad", size = 788639, upload-time = "2026-03-18T10:12:01.69Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/e1/e02fafc01c18f1868a2d2c030953f49e38d65f2d95884789a6c46ff308f1/nvtx-0.2.15-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3c6d0f27d4f8a2f479eb64a6b842c13aee32120348a1715d995b9bb9f75b35cf", size = 774614, upload-time = "2026-03-18T10:12:46.979Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/7b/c1b96f13ef89bdf2a8c2f326a97bed89699271990d7c8624fda3fedc6e61/nvtx-0.2.15-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58653bf6fd8453947b9e5153da2ad7aeb0ceafa030de7f133efb3eada5da7ca7", size = 790247, upload-time = "2026-03-18T10:11:39.124Z" },
 ]
 
 [[package]]
@@ -4446,18 +4365,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ce/0c/d2ccb6f32feeca906d5a7c4255340df5262af8838441ca06c9e4e37b67d5/obstore-0.8.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:12c885a9ce5ceb09d13cc186586c0c10b62597eff21b985f6ce8ff9dab963ad3", size = 3773081, upload-time = "2025-09-16T15:33:58.475Z" },
     { url = "https://files.pythonhosted.org/packages/fa/79/40d1cc504cefc89c9b3dd8874287f3fddc7d963a8748d6dffc5880222013/obstore-0.8.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4accc883b93349a81c9931e15dd318cc703b02bbef2805d964724c73d006d00e", size = 3938589, upload-time = "2025-09-16T15:33:59.734Z" },
     { url = "https://files.pythonhosted.org/packages/14/dd/916c6777222db3271e9fb3cf9a97ed92b3a9b3e465bdeec96de9ab809d53/obstore-0.8.2-cp313-cp313-win_amd64.whl", hash = "sha256:ec850adf9980e5788a826ccfd5819989724e2a2f712bfa3258e85966c8d9981e", size = 3977768, upload-time = "2025-09-16T15:34:01.25Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/61/66f8dc98bbf5613bbfe5bf21747b4c8091442977f4bd897945895ab7325c/obstore-0.8.2-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:1431e40e9bb4773a261e51b192ea6489d0799b9d4d7dbdf175cdf813eb8c0503", size = 3623364, upload-time = "2025-09-16T15:34:02.957Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/66/6d527b3027e42f625c8fc816ac7d19b0d6228f95bfe7666e4d6b081d2348/obstore-0.8.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ddb39d4da303f50b959da000aa42734f6da7ac0cc0be2d5a7838b62c97055bb9", size = 3347764, upload-time = "2025-09-16T15:34:04.236Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/79/c00103302b620192ea447a948921ad3fed031ce3d19e989f038e1183f607/obstore-0.8.2-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e01f4e13783db453e17e005a4a3ceff09c41c262e44649ba169d253098c775e8", size = 3460981, upload-time = "2025-09-16T15:34:05.595Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/d9/bfe4ed4b1aebc45b56644dd5b943cf8e1673505cccb352e66878a457e807/obstore-0.8.2-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:df0fc2d0bc17caff9b538564ddc26d7616f7e8b7c65b1a3c90b5048a8ad2e797", size = 3692711, upload-time = "2025-09-16T15:34:06.796Z" },
-    { url = "https://files.pythonhosted.org/packages/13/47/cd6c2cbb18e1f40c77e7957a4a03d2d83f1859a2e876a408f1ece81cad4c/obstore-0.8.2-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e439d06c99a140348f046c9f598ee349cc2dcd9105c15540a4b231f9cc48bbae", size = 3958362, upload-time = "2025-09-16T15:34:08.277Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/ea/5ee82bf23abd71c7d6a3f2d008197ae8f8f569d41314c26a8f75318245be/obstore-0.8.2-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0e37d9046669fcc59522d0faf1d105fcbfd09c84cccaaa1e809227d8e030f32c", size = 3957082, upload-time = "2025-09-16T15:34:09.477Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/ee/46650405e50fdaa8d95f30375491f9c91fac9517980e8a28a4a6af66927f/obstore-0.8.2-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2646fdcc4bbe92dc2bb5bcdff15574da1211f5806c002b66d514cee2a23c7cb8", size = 3775539, upload-time = "2025-09-16T15:34:10.726Z" },
-    { url = "https://files.pythonhosted.org/packages/35/d6/348a7ebebe2ca3d94dfc75344ea19675ae45472823e372c1852844078307/obstore-0.8.2-cp314-cp314-manylinux_2_24_aarch64.whl", hash = "sha256:e31a7d37675056d93dfc244605089dee67f5bba30f37c88436623c8c5ad9ba9d", size = 3535048, upload-time = "2025-09-16T15:34:12.076Z" },
-    { url = "https://files.pythonhosted.org/packages/41/07/b7a16cc0da91a4b902d47880ad24016abfe7880c63f7cdafda45d89a2f91/obstore-0.8.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:656313dd8170dde0f0cd471433283337a63912e8e790a121f7cc7639c83e3816", size = 3699035, upload-time = "2025-09-16T15:34:13.331Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/74/3269a3a58347e0b019742d888612c4b765293c9c75efa44e144b1e884c0d/obstore-0.8.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:329038c9645d6d1741e77fe1a53e28a14b1a5c1461cfe4086082ad39ebabf981", size = 3687307, upload-time = "2025-09-16T15:34:14.501Z" },
-    { url = "https://files.pythonhosted.org/packages/01/f9/4fd4819ad6a49d2f462a45be453561f4caebded0dc40112deeffc34b89b1/obstore-0.8.2-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:1e4df99b369790c97c752d126b286dc86484ea49bff5782843a265221406566f", size = 3776076, upload-time = "2025-09-16T15:34:16.207Z" },
-    { url = "https://files.pythonhosted.org/packages/14/dd/7c4f958fa0b9fc4778fb3d232e38b37db8c6b260f641022fbba48b049d7e/obstore-0.8.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:9e1c65c65e20cc990414a8a9af88209b1bbc0dd9521b5f6b0293c60e19439bb7", size = 3947445, upload-time = "2025-09-16T15:34:17.423Z" },
     { url = "https://files.pythonhosted.org/packages/c3/37/14bae1f5bf4369027abc5315cdba2428ad4c16e2fd3bd5d35b7ee584aa0c/obstore-0.8.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6ea04118980a9c22fc8581225ff4507b6a161baf8949d728d96e68326ebaab59", size = 3624857, upload-time = "2025-09-16T15:34:35.601Z" },
     { url = "https://files.pythonhosted.org/packages/1a/c4/8cba91629aa20479ba86a57c2c2b3bc0a54fc6a31a4594014213603efae6/obstore-0.8.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5f33a7570b6001b54252260fbec18c3f6d21e25d3ec57e9b6c5e7330e8290eb2", size = 3355999, upload-time = "2025-09-16T15:34:36.954Z" },
     { url = "https://files.pythonhosted.org/packages/f2/10/3e40557d6d9c38c5a0f7bac1508209b9dbb8c4da918ddfa9326ba9a1de3f/obstore-0.8.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:11fa78dfb749edcf5a041cd6db20eae95b3e8b09dfdd9b38d14939da40e7c115", size = 3457322, upload-time = "2025-09-16T15:34:38.143Z" },
@@ -4490,14 +4397,14 @@ name = "openai"
 version = "2.33.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "distro", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "httpx", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jiter", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pydantic", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "sniffio", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "tqdm", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "anyio", marker = "python_full_version >= '3.11'" },
+    { name = "distro", marker = "python_full_version >= '3.11'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "jiter", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "sniffio", marker = "python_full_version >= '3.11'" },
+    { name = "tqdm", marker = "python_full_version >= '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f0/ee/d056c82f63c05f06baac0cffb4a90952d8274f90c49dfe244f20497b9bbd/openai-2.33.0.tar.gz", hash = "sha256:f850c435e2a4685bba3295bd54912dd26315d9c1b7733068186134d6e0599f9a", size = 693254, upload-time = "2026-04-28T14:04:42.428Z" }
 wheels = [
@@ -4607,8 +4514,8 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "python-dateutil" },
     { name = "pytz" },
     { name = "tzdata" },
@@ -4649,19 +4556,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
     { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
     { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
-    { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
-    { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
-    { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
-    { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
-    { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
-    { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
-    { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
 ]
 
 [[package]]
@@ -4778,31 +4672,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" },
     { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" },
     { url = "https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" },
-    { url = "https://files.pythonhosted.org/packages/54/2a/9a8c6ba2c2c07b71bec92cf63e03370ca5e5f5c5b119b742bcc0cde3f9c5/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:beeae3f27f62308f1ddbcfb0690bf44b10732f2ef43758f169d5e9303165d3f9", size = 4045531, upload-time = "2025-10-15T18:23:10.121Z" },
-    { url = "https://files.pythonhosted.org/packages/84/54/836fdbf1bfb3d66a59f0189ff0b9f5f666cee09c6188309300df04ad71fa/pillow-12.0.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d4827615da15cd59784ce39d3388275ec093ae3ee8d7f0c089b76fa87af756c2", size = 4120554, upload-time = "2025-10-15T18:23:12.14Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/cd/16aec9f0da4793e98e6b54778a5fbce4f375c6646fe662e80600b8797379/pillow-12.0.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:3e42edad50b6909089750e65c91aa09aaf1e0a71310d383f11321b27c224ed8a", size = 3576812, upload-time = "2025-10-15T18:23:13.962Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/b7/13957fda356dc46339298b351cae0d327704986337c3c69bb54628c88155/pillow-12.0.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e5d8efac84c9afcb40914ab49ba063d94f5dbdf5066db4482c66a992f47a3a3b", size = 5252689, upload-time = "2025-10-15T18:23:15.562Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/f5/eae31a306341d8f331f43edb2e9122c7661b975433de5e447939ae61c5da/pillow-12.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:266cd5f2b63ff316d5a1bba46268e603c9caf5606d44f38c2873c380950576ad", size = 4650186, upload-time = "2025-10-15T18:23:17.379Z" },
-    { url = "https://files.pythonhosted.org/packages/86/62/2a88339aa40c4c77e79108facbd307d6091e2c0eb5b8d3cf4977cfca2fe6/pillow-12.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:58eea5ebe51504057dd95c5b77d21700b77615ab0243d8152793dc00eb4faf01", size = 6230308, upload-time = "2025-10-15T18:23:18.971Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/33/5425a8992bcb32d1cb9fa3dd39a89e613d09a22f2c8083b7bf43c455f760/pillow-12.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f13711b1a5ba512d647a0e4ba79280d3a9a045aaf7e0cc6fbe96b91d4cdf6b0c", size = 8039222, upload-time = "2025-10-15T18:23:20.909Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/61/3f5d3b35c5728f37953d3eec5b5f3e77111949523bd2dd7f31a851e50690/pillow-12.0.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6846bd2d116ff42cba6b646edf5bf61d37e5cbd256425fa089fee4ff5c07a99e", size = 6346657, upload-time = "2025-10-15T18:23:23.077Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/be/ee90a3d79271227e0f0a33c453531efd6ed14b2e708596ba5dd9be948da3/pillow-12.0.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c98fa880d695de164b4135a52fd2e9cd7b7c90a9d8ac5e9e443a24a95ef9248e", size = 7038482, upload-time = "2025-10-15T18:23:25.005Z" },
-    { url = "https://files.pythonhosted.org/packages/44/34/a16b6a4d1ad727de390e9bd9f19f5f669e079e5826ec0f329010ddea492f/pillow-12.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa3ed2a29a9e9d2d488b4da81dcb54720ac3104a20bf0bd273f1e4648aff5af9", size = 6461416, upload-time = "2025-10-15T18:23:27.009Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/39/1aa5850d2ade7d7ba9f54e4e4c17077244ff7a2d9e25998c38a29749eb3f/pillow-12.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d034140032870024e6b9892c692fe2968493790dd57208b2c37e3fb35f6df3ab", size = 7131584, upload-time = "2025-10-15T18:23:29.752Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/db/4fae862f8fad0167073a7733973bfa955f47e2cac3dc3e3e6257d10fab4a/pillow-12.0.0-cp314-cp314-win32.whl", hash = "sha256:1b1b133e6e16105f524a8dec491e0586d072948ce15c9b914e41cdadd209052b", size = 6400621, upload-time = "2025-10-15T18:23:32.06Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/24/b350c31543fb0107ab2599464d7e28e6f856027aadda995022e695313d94/pillow-12.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8dc232e39d409036af549c86f24aed8273a40ffa459981146829a324e0848b4b", size = 7142916, upload-time = "2025-10-15T18:23:34.71Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/9b/0ba5a6fd9351793996ef7487c4fdbde8d3f5f75dbedc093bb598648fddf0/pillow-12.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:d52610d51e265a51518692045e372a4c363056130d922a7351429ac9f27e70b0", size = 2523836, upload-time = "2025-10-15T18:23:36.967Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/7a/ceee0840aebc579af529b523d530840338ecf63992395842e54edc805987/pillow-12.0.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1979f4566bb96c1e50a62d9831e2ea2d1211761e5662afc545fa766f996632f6", size = 5255092, upload-time = "2025-10-15T18:23:38.573Z" },
-    { url = "https://files.pythonhosted.org/packages/44/76/20776057b4bfd1aef4eeca992ebde0f53a4dce874f3ae693d0ec90a4f79b/pillow-12.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b2e4b27a6e15b04832fe9bf292b94b5ca156016bbc1ea9c2c20098a0320d6cf6", size = 4653158, upload-time = "2025-10-15T18:23:40.238Z" },
-    { url = "https://files.pythonhosted.org/packages/82/3f/d9ff92ace07be8836b4e7e87e6a4c7a8318d47c2f1463ffcf121fc57d9cb/pillow-12.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fb3096c30df99fd01c7bf8e544f392103d0795b9f98ba71a8054bcbf56b255f1", size = 6267882, upload-time = "2025-10-15T18:23:42.434Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/7a/4f7ff87f00d3ad33ba21af78bfcd2f032107710baf8280e3722ceec28cda/pillow-12.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7438839e9e053ef79f7112c881cef684013855016f928b168b81ed5835f3e75e", size = 8071001, upload-time = "2025-10-15T18:23:44.29Z" },
-    { url = "https://files.pythonhosted.org/packages/75/87/fcea108944a52dad8cca0715ae6247e271eb80459364a98518f1e4f480c1/pillow-12.0.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d5c411a8eaa2299322b647cd932586b1427367fd3184ffbb8f7a219ea2041ca", size = 6380146, upload-time = "2025-10-15T18:23:46.065Z" },
-    { url = "https://files.pythonhosted.org/packages/91/52/0d31b5e571ef5fd111d2978b84603fce26aba1b6092f28e941cb46570745/pillow-12.0.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d7e091d464ac59d2c7ad8e7e08105eaf9dafbc3883fd7265ffccc2baad6ac925", size = 7067344, upload-time = "2025-10-15T18:23:47.898Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/f4/2dd3d721f875f928d48e83bb30a434dee75a2531bca839bb996bb0aa5a91/pillow-12.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:792a2c0be4dcc18af9d4a2dfd8a11a17d5e25274a1062b0ec1c2d79c76f3e7f8", size = 6491864, upload-time = "2025-10-15T18:23:49.607Z" },
-    { url = "https://files.pythonhosted.org/packages/30/4b/667dfcf3d61fc309ba5a15b141845cece5915e39b99c1ceab0f34bf1d124/pillow-12.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:afbefa430092f71a9593a99ab6a4e7538bc9eabbf7bf94f91510d3503943edc4", size = 7158911, upload-time = "2025-10-15T18:23:51.351Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/2f/16cabcc6426c32218ace36bf0d55955e813f2958afddbf1d391849fee9d1/pillow-12.0.0-cp314-cp314t-win32.whl", hash = "sha256:3830c769decf88f1289680a59d4f4c46c72573446352e2befec9a8512104fa52", size = 6408045, upload-time = "2025-10-15T18:23:53.177Z" },
-    { url = "https://files.pythonhosted.org/packages/35/73/e29aa0c9c666cf787628d3f0dcf379f4791fba79f4936d02f8b37165bdf8/pillow-12.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:905b0365b210c73afb0ebe9101a32572152dfd1c144c7e28968a331b9217b94a", size = 7148282, upload-time = "2025-10-15T18:23:55.316Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/70/6b41bdcddf541b437bbb9f47f94d2db5d9ddef6c37ccab8c9107743748a4/pillow-12.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:99353a06902c2e43b43e8ff74ee65a7d90307d82370604746738a1e0661ccca7", size = 2525630, upload-time = "2025-10-15T18:23:57.149Z" },
     { url = "https://files.pythonhosted.org/packages/1d/b3/582327e6c9f86d037b63beebe981425d6811104cb443e8193824ef1a2f27/pillow-12.0.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b22bd8c974942477156be55a768f7aa37c46904c175be4e158b6a86e3a6b7ca8", size = 5215068, upload-time = "2025-10-15T18:23:59.594Z" },
     { url = "https://files.pythonhosted.org/packages/fd/d6/67748211d119f3b6540baf90f92fae73ae51d5217b171b0e8b5f7e5d558f/pillow-12.0.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:805ebf596939e48dbb2e4922a1d3852cfc25c38160751ce02da93058b48d252a", size = 4614994, upload-time = "2025-10-15T18:24:01.669Z" },
     { url = "https://files.pythonhosted.org/packages/2d/e1/f8281e5d844c41872b273b9f2c34a4bf64ca08905668c8ae730eedc7c9fa/pillow-12.0.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cae81479f77420d217def5f54b5b9d279804d17e982e0f2fa19b1d1e14ab5197", size = 5246639, upload-time = "2025-10-15T18:24:03.403Z" },
@@ -4817,25 +4686,15 @@ name = "pillow"
 version = "12.2.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819, upload-time = "2026-04-01T14:46:17.687Z" }
 wheels = [
@@ -4897,31 +4756,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ad/4b/926ab182c07fccae9fcb120043464e1ff1564775ec8864f21a0ebce6ac25/pillow-12.2.0-cp313-cp313t-win32.whl", hash = "sha256:ee3120ae9dff32f121610bb08e4313be87e03efeadfc6c0d18f89127e24d0c24", size = 6379592, upload-time = "2026-04-01T14:44:40.336Z" },
     { url = "https://files.pythonhosted.org/packages/c2/c4/f9e476451a098181b30050cc4c9a3556b64c02cf6497ea421ac047e89e4b/pillow-12.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:325ca0528c6788d2a6c3d40e3568639398137346c3d6e66bb61db96b96511c98", size = 7085542, upload-time = "2026-04-01T14:44:43.251Z" },
     { url = "https://files.pythonhosted.org/packages/00/a4/285f12aeacbe2d6dc36c407dfbbe9e96d4a80b0fb710a337f6d2ad978c75/pillow-12.2.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e5a76d03a6c6dcef67edabda7a52494afa4035021a79c8558e14af25313d453", size = 2465765, upload-time = "2026-04-01T14:44:45.996Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/98/4595daa2365416a86cb0d495248a393dfc84e96d62ad080c8546256cb9c0/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8", size = 4100848, upload-time = "2026-04-01T14:44:48.48Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/79/40184d464cf89f6663e18dfcf7ca21aae2491fff1a16127681bf1fa9b8cf/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b", size = 4176515, upload-time = "2026-04-01T14:44:51.353Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/63/703f86fd4c422a9cf722833670f4f71418fb116b2853ff7da722ea43f184/pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295", size = 3640159, upload-time = "2026-04-01T14:44:53.588Z" },
-    { url = "https://files.pythonhosted.org/packages/71/e0/fb22f797187d0be2270f83500aab851536101b254bfa1eae10795709d283/pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed", size = 5312185, upload-time = "2026-04-01T14:44:56.039Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/8c/1a9e46228571de18f8e28f16fabdfc20212a5d019f3e3303452b3f0a580d/pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae", size = 4695386, upload-time = "2026-04-01T14:44:58.663Z" },
-    { url = "https://files.pythonhosted.org/packages/70/62/98f6b7f0c88b9addd0e87c217ded307b36be024d4ff8869a812b241d1345/pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601", size = 6280384, upload-time = "2026-04-01T14:45:01.5Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/03/688747d2e91cfbe0e64f316cd2e8005698f76ada3130d0194664174fa5de/pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be", size = 8091599, upload-time = "2026-04-01T14:45:04.5Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/35/577e22b936fcdd66537329b33af0b4ccfefaeabd8aec04b266528cddb33c/pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f", size = 6396021, upload-time = "2026-04-01T14:45:07.117Z" },
-    { url = "https://files.pythonhosted.org/packages/11/8d/d2532ad2a603ca2b93ad9f5135732124e57811d0168155852f37fbce2458/pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286", size = 7083360, upload-time = "2026-04-01T14:45:09.763Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/26/d325f9f56c7e039034897e7380e9cc202b1e368bfd04d4cbe6a441f02885/pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50", size = 6507628, upload-time = "2026-04-01T14:45:12.378Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/f7/769d5632ffb0988f1c5e7660b3e731e30f7f8ec4318e94d0a5d674eb65a4/pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104", size = 7209321, upload-time = "2026-04-01T14:45:15.122Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/7a/c253e3c645cd47f1aceea6a8bacdba9991bf45bb7dfe927f7c893e89c93c/pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7", size = 6479723, upload-time = "2026-04-01T14:45:17.797Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/8b/601e6566b957ca50e28725cb6c355c59c2c8609751efbecd980db44e0349/pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150", size = 7217400, upload-time = "2026-04-01T14:45:20.529Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/94/220e46c73065c3e2951bb91c11a1fb636c8c9ad427ac3ce7d7f3359b9b2f/pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1", size = 2554835, upload-time = "2026-04-01T14:45:23.162Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/ab/1b426a3974cb0e7da5c29ccff4807871d48110933a57207b5a676cccc155/pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463", size = 5314225, upload-time = "2026-04-01T14:45:25.637Z" },
-    { url = "https://files.pythonhosted.org/packages/19/1e/dce46f371be2438eecfee2a1960ee2a243bbe5e961890146d2dee1ff0f12/pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3", size = 4698541, upload-time = "2026-04-01T14:45:28.355Z" },
-    { url = "https://files.pythonhosted.org/packages/55/c3/7fbecf70adb3a0c33b77a300dc52e424dc22ad8cdc06557a2e49523b703d/pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166", size = 6322251, upload-time = "2026-04-01T14:45:30.924Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/3c/7fbc17cfb7e4fe0ef1642e0abc17fc6c94c9f7a16be41498e12e2ba60408/pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe", size = 8127807, upload-time = "2026-04-01T14:45:33.908Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/c3/a8ae14d6defd2e448493ff512fae903b1e9bd40b72efb6ec55ce0048c8ce/pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd", size = 6433935, upload-time = "2026-04-01T14:45:36.623Z" },
-    { url = "https://files.pythonhosted.org/packages/6e/32/2880fb3a074847ac159d8f902cb43278a61e85f681661e7419e6596803ed/pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e", size = 7116720, upload-time = "2026-04-01T14:45:39.258Z" },
-    { url = "https://files.pythonhosted.org/packages/46/87/495cc9c30e0129501643f24d320076f4cc54f718341df18cc70ec94c44e1/pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06", size = 6540498, upload-time = "2026-04-01T14:45:41.879Z" },
-    { url = "https://files.pythonhosted.org/packages/18/53/773f5edca692009d883a72211b60fdaf8871cbef075eaa9d577f0a2f989e/pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43", size = 7239413, upload-time = "2026-04-01T14:45:44.705Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084, upload-time = "2026-04-01T14:45:47.568Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152, upload-time = "2026-04-01T14:45:50.032Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579, upload-time = "2026-04-01T14:45:52.529Z" },
     { url = "https://files.pythonhosted.org/packages/4e/b7/2437044fb910f499610356d1352e3423753c98e34f915252aafecc64889f/pillow-12.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538bd5e05efec03ae613fd89c4ce0368ecd2ba239cc25b9f9be7ed426b0af1f", size = 5273969, upload-time = "2026-04-01T14:45:55.538Z" },
     { url = "https://files.pythonhosted.org/packages/f6/f4/8316e31de11b780f4ac08ef3654a75555e624a98db1056ecb2122d008d5a/pillow-12.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:394167b21da716608eac917c60aa9b969421b5dcbbe02ae7f013e7b85811c69d", size = 4659674, upload-time = "2026-04-01T14:45:58.093Z" },
     { url = "https://files.pythonhosted.org/packages/d4/37/664fca7201f8bb2aa1d20e2c3d5564a62e6ae5111741966c8319ca802361/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d04bfa02cc2d23b497d1e90a0f927070043f6cbf303e738300532379a4b4e0f", size = 5288479, upload-time = "2026-04-01T14:46:01.141Z" },
@@ -4966,13 +4800,57 @@ name = "portpicker"
 version = "1.6.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "psutil", marker = "python_full_version < '3.14'" },
+    { name = "psutil" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/4d/d0/cda2fc582f09510c84cd6b7d7b9e22a02d4e45dbad2b2ef1c6edd7847e00/portpicker-1.6.0.tar.gz", hash = "sha256:bd507fd6f96f65ee02781f2e674e9dc6c99bbfa6e3c39992e3916204c9d431fa", size = 25676, upload-time = "2023-08-15T04:37:08.865Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/32/2d/440e4d7041fff89f28f483733eb617127aa866135c2dc719e05893f089e1/portpicker-1.6.0-py3-none-any.whl", hash = "sha256:b2787a41404cf7edbe29b07b9e0ed863b09f2665dcc01c1eb0c2261c1e7d0755", size = 16613, upload-time = "2023-08-15T04:37:07.327Z" },
 ]
 
+[[package]]
+name = "preshed"
+version = "3.0.13"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cymem", marker = "python_full_version >= '3.11'" },
+    { name = "murmurhash", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/75/fe6b7bbd0dea530a001b0e24c331b21a0be2786e402abf3c57f5dce43d4b/preshed-3.0.13.tar.gz", hash = "sha256:d75f718bbfd97e992f7827e0fa7faf6a91bdd9c922d5baa4b50d62731396cb89", size = 18338, upload-time = "2026-03-23T08:57:31.378Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/45/7e/d55d8cdeefa78995eec15a11ae16cbd0581a0be2342527a64251fd948cef/preshed-3.0.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:42c58b07e8b431e33d0ad9922e896632453821cad8b09171b619b8c61101916f", size = 136920, upload-time = "2026-03-23T08:56:10.829Z" },
+    { url = "https://files.pythonhosted.org/packages/10/bc/ee1f388a97c613e656d774b522b4ddc1cd32e984ca4eb1157c5d822e9011/preshed-3.0.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a06e27f4e5b9d7943840087828c6a0dae4a3475576d12c2e95b71abbb325a80b", size = 137576, upload-time = "2026-03-23T08:56:12.441Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/dd/24c5a576035df4043998e1069718dd7369e107ce9d169df2333d00461dbf/preshed-3.0.13-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b82d7a7bb63d248a6cbbfcabb4a570c993d54d964e39dc5d85c14018ba2079e", size = 780270, upload-time = "2026-03-23T08:56:14.108Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/ab/fb0f6808fffad96c962ce254587cae2bb7df0fda3e6d6b481ce4f60f6c2d/preshed-3.0.13-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef84b225d226af43adfee78ce5ddede72a6155ce5292c1a41dcd1f0b9c87c30", size = 779722, upload-time = "2026-03-23T08:56:15.721Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/7f/c9948dde95bf965c6af2c31f0dbbc6c7e5433b5de1c85f20644edf38c78c/preshed-3.0.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:70d502e081348df207d90f347f21770ed596822bb04eb3c3b32b7281579e90c6", size = 1775435, upload-time = "2026-03-23T08:56:17.655Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/57/8db29ac57b981ef19d1078001aa6c2055a3eed46998c1c93f3d1fdb86106/preshed-3.0.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:985cb9b097beda76cd13c01a0499707103e8915f888fa30f8aa8324ef2cc6b08", size = 1842612, upload-time = "2026-03-23T08:56:19.755Z" },
+    { url = "https://files.pythonhosted.org/packages/38/e5/ead05efc423be237fba76a3bf0eeb492e5d3801504c096c3552c517f2f72/preshed-3.0.13-cp310-cp310-win_amd64.whl", hash = "sha256:867aa73abbf4ee3b4d7662148091c33a8c039271269e3a7f1e0ca995f91995c8", size = 121951, upload-time = "2026-03-23T08:56:21.138Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/80/9cf7f7c208046c97d4b2765f89545a6ea8cfefbd87f0141dde61e6f098ac/preshed-3.0.13-cp310-cp310-win_arm64.whl", hash = "sha256:2b704e46cb7b88f656ef16a3e5347b36525a1c53721d327a4ba1457404101f85", size = 109604, upload-time = "2026-03-23T08:56:22.537Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/d1/7bc39738388b38ff48cecbb326a9b2bb3f422bb32097be92e010f3162395/preshed-3.0.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5268c0e6fa96f50cdf87f516c2d4b32563c12706ee768e75c00e8d0098acd545", size = 136718, upload-time = "2026-03-23T08:56:23.889Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/65/de465b6801740140c2b5d2db6c312ca7937dcfd0442f1ae7d50dee529544/preshed-3.0.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:df642547a1a94079978a0ea8f4593ab4b8d3bd43f767bef0ef64d9a214f8c4c9", size = 137261, upload-time = "2026-03-23T08:56:25.303Z" },
+    { url = "https://files.pythonhosted.org/packages/89/83/478ee078746a4a413c841542caebd2ea74b659475b8bf5f2e3724b6fe655/preshed-3.0.13-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:09397592d333a77f88454e72b7f1f941b2afaf040b392b9e74898dbc4648cdf5", size = 821010, upload-time = "2026-03-23T08:56:26.455Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/2e/1ac761e973966893cd3a0ad3256360365276e2d1e779e351448981a1156a/preshed-3.0.13-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f8e6fe0620ed0f96a246d46447055c447e071cd8222731a045c235e8a758c918", size = 823096, upload-time = "2026-03-23T08:56:28.126Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/51/7824cfd85dd7fe547888de20228ebd87d9acd3708206d30b82211e382d23/preshed-3.0.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:502f93f49a22788203f02d3067d4ea077a0cca3864de6a792eae12e7ce589e14", size = 1812148, upload-time = "2026-03-23T08:56:29.755Z" },
+    { url = "https://files.pythonhosted.org/packages/34/48/32160a24705d56179de6af838c10a0c735c955dae5f9e4bb344750b79bc2/preshed-3.0.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:acd4d89abeca3678c5d8c89b3cd351314465bc67c7fa053d2644f8513e543386", size = 1881154, upload-time = "2026-03-23T08:56:31.49Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/22/0344b50f8b1ad9e3aac08099c47e1aba91c81602fd117d2673f6606ecae6/preshed-3.0.13-cp311-cp311-win_amd64.whl", hash = "sha256:de87fbabb0f37c3c92d4dd9b94fc82ab73cdab4247cdfbd57ab3926caa983919", size = 122219, upload-time = "2026-03-23T08:56:32.74Z" },
+    { url = "https://files.pythonhosted.org/packages/33/c4/812eeaa568510f396e27edab01100ca71418f032fd7098b107f12e572361/preshed-3.0.13-cp311-cp311-win_arm64.whl", hash = "sha256:5e2753779832e411e93eb727f3d409c0a6b7408e5ce4dd868076d8ece48c7693", size = 109308, upload-time = "2026-03-23T08:56:33.839Z" },
+    { url = "https://files.pythonhosted.org/packages/39/fb/ccff23c44c04088c248539005fcda78b9014512a34d170c5360f02ad908b/preshed-3.0.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5d14eea14bd01291388928991d7df7d60b9fd19ae970e55006eb4d29b0c1e8eb", size = 138497, upload-time = "2026-03-23T08:56:35.321Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/ce/cad5a8145881a771e6c0d002f2e585fc19b962f120860b54d32af5baa342/preshed-3.0.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f05b08ce92399c0655b5e0eb5a1cc1f9e295703ed3aabdfaf6538dfa8ae23d57", size = 138010, upload-time = "2026-03-23T08:56:36.399Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/a2/c5fed4fb3e946699259d11e4036a3cfdd8c89b3e542e3077d46781642425/preshed-3.0.13-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62cf7f3113132891d6bba70ff547ad81c6fe50a31930bbbb8499f1d47cd122b7", size = 861498, upload-time = "2026-03-23T08:56:37.67Z" },
+    { url = "https://files.pythonhosted.org/packages/51/94/8c9bc48a6ea4903f53a1a0031ce8e35687526949f25821762ef21493c007/preshed-3.0.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b8de3f58043070a354477995acdd98626ce43e4193c708ebd0f694e467f5155", size = 868988, upload-time = "2026-03-23T08:56:39.324Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/df/ecd2f40055ff52527ca117ffbfafb888c1a3079b59fbabe03c5b8f9b7240/preshed-3.0.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:183b339956a9e1d7a4a00038a3b9587a734db9e8bd915939a49791bd1b372156", size = 1847382, upload-time = "2026-03-23T08:56:40.89Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/88/bdb244e40284ded3632a9f88c23bc80230bd7b2ae4a8b7f2cc91adead7a8/preshed-3.0.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2e77bed56aded7cbe5d28d6bd2178bc5b13eda0e0e464dab205fb578fa915000", size = 1919236, upload-time = "2026-03-23T08:56:42.616Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c9/c91ea56342e6c364fc69b444a1ac5432327857199c44032c9cc9dc4c3a23/preshed-3.0.13-cp312-cp312-win_amd64.whl", hash = "sha256:04d8f13f2986e5d11af5ac51f55ce3106c70c41b483d20ea392e6180bdd0f870", size = 122938, upload-time = "2026-03-23T08:56:44.271Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/0b/6a99d99619fd83b14c696e2489caed7070647488d4d3ac0b723d35db2de0/preshed-3.0.13-cp312-cp312-win_arm64.whl", hash = "sha256:19318dc1cd8cac6663c6c830bf7e0002d2de853769fb03e056774e97c21bedfd", size = 109194, upload-time = "2026-03-23T08:56:45.346Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/2a/401158195d6dc7f6aef0b354d74d0e95c9da124499448c2b3dbb95b71204/preshed-3.0.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c0d0c14187dc0078d8a63bf190ec045a4d13e7748b6caeb557a7d575e411410b", size = 137289, upload-time = "2026-03-23T08:56:46.516Z" },
+    { url = "https://files.pythonhosted.org/packages/88/8f/e20e64573988528785447a6893b2e7ab287ecfd85b3888e978b28812fd20/preshed-3.0.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7770987c2e57497cd26124a9be5f652b5b3ccd0def89859ab0da8bca6144a3de", size = 136847, upload-time = "2026-03-23T08:56:47.572Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/72/18168f881359c4482d312f8dc196371bdd61c1583a52b34390da4c88bbea/preshed-3.0.13-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4a7bc48220de579be6bdb0a8715482cf36e2a625a6fd5ad26c9f43485a4a23b5", size = 831478, upload-time = "2026-03-23T08:56:48.769Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/3a/3543476091087102775568cea9885dde3453569e9aeee365809108de572f/preshed-3.0.13-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e5c8462472f790c16708306aef3a102a762bd19dfe3d2f8ee08bd5e12f51b835", size = 839913, upload-time = "2026-03-23T08:56:49.937Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/65/b13f01329decc44ef53cfb6b4601ba85382dcb2a4ec78d9250f03a418066/preshed-3.0.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c046736239cc8d72670749b79b526e4111839a2fc461a58545d212797649129c", size = 1816452, upload-time = "2026-03-23T08:56:51.233Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/c7/f1a996c6832234efd4d543041b582418d41ac480ee55c557ec9e65344637/preshed-3.0.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7c333f18e9a81c8a6de0603fd8781e17115324b117c445ca91abdf7bfb1abe49", size = 1888978, upload-time = "2026-03-23T08:56:52.591Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/b9/96fb71499049885ce19545903fdd38877bbc2be0da47e37c04d01f3e9f66/preshed-3.0.13-cp313-cp313-win_amd64.whl", hash = "sha256:461327f8dd36520dcf1fd55a671e0c3c2c97a2d95e22fc85faa31173f4785dda", size = 122134, upload-time = "2026-03-23T08:56:54.392Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/a7/32a4903019d936a2316fdd330bedddac287ac26326107d24fb76a1fbc60a/preshed-3.0.13-cp313-cp313-win_arm64.whl", hash = "sha256:35d6c5acb3ee3b12b87a551913063f0cec784055c2af16e028c19fe875f079d0", size = 108497, upload-time = "2026-03-23T08:56:55.816Z" },
+]
+
 [[package]]
 name = "prometheus-client"
 version = "0.23.1"
@@ -5088,36 +4966,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/92/f7/1d4ec5841505f423469efbfc381d64b7b467438cd5a4bbcbb063f3b73d27/propcache-0.4.1-cp313-cp313t-win32.whl", hash = "sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717", size = 41396, upload-time = "2025-10-08T19:47:47.202Z" },
     { url = "https://files.pythonhosted.org/packages/48/f0/615c30622316496d2cbbc29f5985f7777d3ada70f23370608c1d3e081c1f/propcache-0.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37", size = 44897, upload-time = "2025-10-08T19:47:48.336Z" },
     { url = "https://files.pythonhosted.org/packages/fd/ca/6002e46eccbe0e33dcd4069ef32f7f1c9e243736e07adca37ae8c4830ec3/propcache-0.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a", size = 39789, upload-time = "2025-10-08T19:47:49.876Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/5c/bca52d654a896f831b8256683457ceddd490ec18d9ec50e97dfd8fc726a8/propcache-0.4.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3f7124c9d820ba5548d431afb4632301acf965db49e666aa21c305cbe8c6de12", size = 78152, upload-time = "2025-10-08T19:47:51.051Z" },
-    { url = "https://files.pythonhosted.org/packages/65/9b/03b04e7d82a5f54fb16113d839f5ea1ede58a61e90edf515f6577c66fa8f/propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c0d4b719b7da33599dfe3b22d3db1ef789210a0597bc650b7cee9c77c2be8c5c", size = 44869, upload-time = "2025-10-08T19:47:52.594Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/fa/89a8ef0468d5833a23fff277b143d0573897cf75bd56670a6d28126c7d68/propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9f302f4783709a78240ebc311b793f123328716a60911d667e0c036bc5dcbded", size = 46596, upload-time = "2025-10-08T19:47:54.073Z" },
-    { url = "https://files.pythonhosted.org/packages/86/bd/47816020d337f4a746edc42fe8d53669965138f39ee117414c7d7a340cfe/propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c80ee5802e3fb9ea37938e7eecc307fb984837091d5fd262bb37238b1ae97641", size = 206981, upload-time = "2025-10-08T19:47:55.715Z" },
-    { url = "https://files.pythonhosted.org/packages/df/f6/c5fa1357cc9748510ee55f37173eb31bfde6d94e98ccd9e6f033f2fc06e1/propcache-0.4.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ed5a841e8bb29a55fb8159ed526b26adc5bdd7e8bd7bf793ce647cb08656cdf4", size = 211490, upload-time = "2025-10-08T19:47:57.499Z" },
-    { url = "https://files.pythonhosted.org/packages/80/1e/e5889652a7c4a3846683401a48f0f2e5083ce0ec1a8a5221d8058fbd1adf/propcache-0.4.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:55c72fd6ea2da4c318e74ffdf93c4fe4e926051133657459131a95c846d16d44", size = 215371, upload-time = "2025-10-08T19:47:59.317Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/f2/889ad4b2408f72fe1a4f6a19491177b30ea7bf1a0fd5f17050ca08cfc882/propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8326e144341460402713f91df60ade3c999d601e7eb5ff8f6f7862d54de0610d", size = 201424, upload-time = "2025-10-08T19:48:00.67Z" },
-    { url = "https://files.pythonhosted.org/packages/27/73/033d63069b57b0812c8bd19f311faebeceb6ba31b8f32b73432d12a0b826/propcache-0.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:060b16ae65bc098da7f6d25bf359f1f31f688384858204fe5d652979e0015e5b", size = 197566, upload-time = "2025-10-08T19:48:02.604Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/89/ce24f3dc182630b4e07aa6d15f0ff4b14ed4b9955fae95a0b54c58d66c05/propcache-0.4.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:89eb3fa9524f7bec9de6e83cf3faed9d79bffa560672c118a96a171a6f55831e", size = 193130, upload-time = "2025-10-08T19:48:04.499Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/24/ef0d5fd1a811fb5c609278d0209c9f10c35f20581fcc16f818da959fc5b4/propcache-0.4.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:dee69d7015dc235f526fe80a9c90d65eb0039103fe565776250881731f06349f", size = 202625, upload-time = "2025-10-08T19:48:06.213Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/02/98ec20ff5546f68d673df2f7a69e8c0d076b5abd05ca882dc7ee3a83653d/propcache-0.4.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5558992a00dfd54ccbc64a32726a3357ec93825a418a401f5cc67df0ac5d9e49", size = 204209, upload-time = "2025-10-08T19:48:08.432Z" },
-    { url = "https://files.pythonhosted.org/packages/a0/87/492694f76759b15f0467a2a93ab68d32859672b646aa8a04ce4864e7932d/propcache-0.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c9b822a577f560fbd9554812526831712c1436d2c046cedee4c3796d3543b144", size = 197797, upload-time = "2025-10-08T19:48:09.968Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/36/66367de3575db1d2d3f3d177432bd14ee577a39d3f5d1b3d5df8afe3b6e2/propcache-0.4.1-cp314-cp314-win32.whl", hash = "sha256:ab4c29b49d560fe48b696cdcb127dd36e0bc2472548f3bf56cc5cb3da2b2984f", size = 38140, upload-time = "2025-10-08T19:48:11.232Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/2a/a758b47de253636e1b8aef181c0b4f4f204bf0dd964914fb2af90a95b49b/propcache-0.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:5a103c3eb905fcea0ab98be99c3a9a5ab2de60228aa5aceedc614c0281cf6153", size = 41257, upload-time = "2025-10-08T19:48:12.707Z" },
-    { url = "https://files.pythonhosted.org/packages/34/5e/63bd5896c3fec12edcbd6f12508d4890d23c265df28c74b175e1ef9f4f3b/propcache-0.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:74c1fb26515153e482e00177a1ad654721bf9207da8a494a0c05e797ad27b992", size = 38097, upload-time = "2025-10-08T19:48:13.923Z" },
-    { url = "https://files.pythonhosted.org/packages/99/85/9ff785d787ccf9bbb3f3106f79884a130951436f58392000231b4c737c80/propcache-0.4.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:824e908bce90fb2743bd6b59db36eb4f45cd350a39637c9f73b1c1ea66f5b75f", size = 81455, upload-time = "2025-10-08T19:48:15.16Z" },
-    { url = "https://files.pythonhosted.org/packages/90/85/2431c10c8e7ddb1445c1f7c4b54d886e8ad20e3c6307e7218f05922cad67/propcache-0.4.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2b5e7db5328427c57c8e8831abda175421b709672f6cfc3d630c3b7e2146393", size = 46372, upload-time = "2025-10-08T19:48:16.424Z" },
-    { url = "https://files.pythonhosted.org/packages/01/20/b0972d902472da9bcb683fa595099911f4d2e86e5683bcc45de60dd05dc3/propcache-0.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6f6ff873ed40292cd4969ef5310179afd5db59fdf055897e282485043fc80ad0", size = 48411, upload-time = "2025-10-08T19:48:17.577Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/e3/7dc89f4f21e8f99bad3d5ddb3a3389afcf9da4ac69e3deb2dcdc96e74169/propcache-0.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49a2dc67c154db2c1463013594c458881a069fcf98940e61a0569016a583020a", size = 275712, upload-time = "2025-10-08T19:48:18.901Z" },
-    { url = "https://files.pythonhosted.org/packages/20/67/89800c8352489b21a8047c773067644e3897f02ecbbd610f4d46b7f08612/propcache-0.4.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:005f08e6a0529984491e37d8dbc3dd86f84bd78a8ceb5fa9a021f4c48d4984be", size = 273557, upload-time = "2025-10-08T19:48:20.762Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/a1/b52b055c766a54ce6d9c16d9aca0cad8059acd9637cdf8aa0222f4a026ef/propcache-0.4.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5c3310452e0d31390da9035c348633b43d7e7feb2e37be252be6da45abd1abcc", size = 280015, upload-time = "2025-10-08T19:48:22.592Z" },
-    { url = "https://files.pythonhosted.org/packages/48/c8/33cee30bd890672c63743049f3c9e4be087e6780906bfc3ec58528be59c1/propcache-0.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c3c70630930447f9ef1caac7728c8ad1c56bc5015338b20fed0d08ea2480b3a", size = 262880, upload-time = "2025-10-08T19:48:23.947Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/b1/8f08a143b204b418285c88b83d00edbd61afbc2c6415ffafc8905da7038b/propcache-0.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e57061305815dfc910a3634dcf584f08168a8836e6999983569f51a8544cd89", size = 260938, upload-time = "2025-10-08T19:48:25.656Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/12/96e4664c82ca2f31e1c8dff86afb867348979eb78d3cb8546a680287a1e9/propcache-0.4.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:521a463429ef54143092c11a77e04056dd00636f72e8c45b70aaa3140d639726", size = 247641, upload-time = "2025-10-08T19:48:27.207Z" },
-    { url = "https://files.pythonhosted.org/packages/18/ed/e7a9cfca28133386ba52278136d42209d3125db08d0a6395f0cba0c0285c/propcache-0.4.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:120c964da3fdc75e3731aa392527136d4ad35868cc556fd09bb6d09172d9a367", size = 262510, upload-time = "2025-10-08T19:48:28.65Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/76/16d8bf65e8845dd62b4e2b57444ab81f07f40caa5652b8969b87ddcf2ef6/propcache-0.4.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d8f353eb14ee3441ee844ade4277d560cdd68288838673273b978e3d6d2c8f36", size = 263161, upload-time = "2025-10-08T19:48:30.133Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/70/c99e9edb5d91d5ad8a49fa3c1e8285ba64f1476782fed10ab251ff413ba1/propcache-0.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ab2943be7c652f09638800905ee1bab2c544e537edb57d527997a24c13dc1455", size = 257393, upload-time = "2025-10-08T19:48:31.567Z" },
-    { url = "https://files.pythonhosted.org/packages/08/02/87b25304249a35c0915d236575bc3574a323f60b47939a2262b77632a3ee/propcache-0.4.1-cp314-cp314t-win32.whl", hash = "sha256:05674a162469f31358c30bcaa8883cb7829fa3110bf9c0991fe27d7896c42d85", size = 42546, upload-time = "2025-10-08T19:48:32.872Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/ef/3c6ecf8b317aa982f309835e8f96987466123c6e596646d4e6a1dfcd080f/propcache-0.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:990f6b3e2a27d683cb7602ed6c86f15ee6b43b1194736f9baaeb93d0016633b1", size = 46259, upload-time = "2025-10-08T19:48:34.226Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/2d/346e946d4951f37eca1e4f55be0f0174c52cd70720f84029b02f296f4a38/propcache-0.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:ecef2343af4cc68e05131e45024ba34f6095821988a9d0a02aa7c73fcc448aa9", size = 40428, upload-time = "2025-10-08T19:48:35.441Z" },
     { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
 ]
 
@@ -5160,12 +5008,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/65/b628f8459bca4efbfae50d4bf3feaab803de9a160b9d5f3bd9295a33f0c2/psutil-7.2.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35630d5af80d5d0d49cfc4d64c1c13838baf6717a13effb35869a5919b854cdf", size = 183201, upload-time = "2025-12-29T08:26:10.622Z" },
     { url = "https://files.pythonhosted.org/packages/fb/23/851cadc9764edcc18f0effe7d0bf69f727d4cf2442deb4a9f78d4e4f30f2/psutil-7.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:923f8653416604e356073e6e0bccbe7c09990acef442def2f5640dd0faa9689f", size = 139081, upload-time = "2025-12-29T08:26:12.483Z" },
     { url = "https://files.pythonhosted.org/packages/59/82/d63e8494ec5758029f31c6cb06d7d161175d8281e91d011a4a441c8a43b5/psutil-7.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cfbe6b40ca48019a51827f20d830887b3107a74a79b01ceb8cc8de4ccb17b672", size = 134767, upload-time = "2025-12-29T08:26:14.528Z" },
-    { url = "https://files.pythonhosted.org/packages/05/c2/5fb764bd61e40e1fe756a44bd4c21827228394c17414ade348e28f83cd79/psutil-7.2.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:494c513ccc53225ae23eec7fe6e1482f1b8a44674241b54561f755a898650679", size = 129716, upload-time = "2025-12-29T08:26:16.017Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/d2/935039c20e06f615d9ca6ca0ab756cf8408a19d298ffaa08666bc18dc805/psutil-7.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3fce5f92c22b00cdefd1645aa58ab4877a01679e901555067b1bd77039aa589f", size = 130133, upload-time = "2025-12-29T08:26:18.009Z" },
-    { url = "https://files.pythonhosted.org/packages/77/69/19f1eb0e01d24c2b3eacbc2f78d3b5add8a89bf0bb69465bc8d563cc33de/psutil-7.2.1-cp314-cp314t-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93f3f7b0bb07711b49626e7940d6fe52aa9940ad86e8f7e74842e73189712129", size = 181518, upload-time = "2025-12-29T08:26:20.241Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/6d/7e18b1b4fa13ad370787626c95887b027656ad4829c156bb6569d02f3262/psutil-7.2.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d34d2ca888208eea2b5c68186841336a7f5e0b990edec929be909353a202768a", size = 184348, upload-time = "2025-12-29T08:26:22.215Z" },
-    { url = "https://files.pythonhosted.org/packages/98/60/1672114392dd879586d60dd97896325df47d9a130ac7401318005aab28ec/psutil-7.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2ceae842a78d1603753561132d5ad1b2f8a7979cb0c283f5b52fb4e6e14b1a79", size = 140400, upload-time = "2025-12-29T08:26:23.993Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/7b/d0e9d4513c46e46897b46bcfc410d51fc65735837ea57a25170f298326e6/psutil-7.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:08a2f175e48a898c8eb8eace45ce01777f4785bc744c90aa2cc7f2fa5462a266", size = 135430, upload-time = "2025-12-29T08:26:25.999Z" },
     { url = "https://files.pythonhosted.org/packages/c5/cf/5180eb8c8bdf6a503c6919f1da28328bd1e6b3b1b5b9d5b01ae64f019616/psutil-7.2.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b2e953fcfaedcfbc952b44744f22d16575d3aa78eb4f51ae74165b4e96e55f42", size = 128137, upload-time = "2025-12-29T08:26:27.759Z" },
     { url = "https://files.pythonhosted.org/packages/c5/2c/78e4a789306a92ade5000da4f5de3255202c534acdadc3aac7b5458fadef/psutil-7.2.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:05cc68dbb8c174828624062e73078e7e35406f4ca2d0866c272c2410d8ef06d1", size = 128947, upload-time = "2025-12-29T08:26:29.548Z" },
     { url = "https://files.pythonhosted.org/packages/29/f8/40e01c350ad9a2b3cb4e6adbcc8a83b17ee50dd5792102b6142385937db5/psutil-7.2.1-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5e38404ca2bb30ed7267a46c02f06ff842e92da3bb8c5bfdadbd35a5722314d8", size = 154694, upload-time = "2025-12-29T08:26:32.147Z" },
@@ -5205,25 +5047,15 @@ name = "pyarrow"
 version = "19.0.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7f/09/a9046344212690f0632b9c709f9bf18506522feb333c894d0de81d62341a/pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e", size = 1129437, upload-time = "2025-02-18T18:55:57.027Z" }
 wheels = [
@@ -5308,20 +5140,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" },
     { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" },
     { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" },
-    { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" },
-    { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" },
-    { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" },
-    { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" },
 ]
 
 [[package]]
@@ -5371,7 +5189,7 @@ wheels = [
 
 [package.optional-dependencies]
 email = [
-    { name = "email-validator", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "email-validator", marker = "python_full_version >= '3.11'" },
 ]
 
 [[package]]
@@ -5438,34 +5256,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" },
     { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" },
     { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" },
-    { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" },
-    { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" },
-    { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" },
-    { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" },
-    { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" },
-    { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" },
-    { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" },
-    { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" },
-    { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" },
-    { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" },
-    { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" },
-    { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" },
     { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" },
     { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" },
     { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" },
@@ -5544,25 +5334,15 @@ name = "pygments"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" }
 wheels = [
@@ -5580,7 +5360,7 @@ wheels = [
 
 [package.optional-dependencies]
 crypto = [
-    { name = "cryptography", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "cryptography", marker = "python_full_version >= '3.11'" },
 ]
 
 [[package]]
@@ -5641,18 +5421,6 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" },
-    { url = "https://files.pythonhosted.org/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" },
-    { url = "https://files.pythonhosted.org/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" },
-    { url = "https://files.pythonhosted.org/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" },
-    { url = "https://files.pythonhosted.org/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" },
-    { url = "https://files.pythonhosted.org/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" },
     { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" },
     { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" },
     { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" },
@@ -5684,7 +5452,7 @@ wheels = [
 
 [[package]]
 name = "pytest"
-version = "9.0.2"
+version = "9.0.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "colorama", marker = "sys_platform == 'win32'" },
@@ -5696,9 +5464,9 @@ dependencies = [
     { name = "pygments", version = "2.20.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "tomli", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
 ]
 
 [[package]]
@@ -5750,16 +5518,12 @@ name = "python-magic"
 version = "0.4.24"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3a/70/76b185393fecf78f81c12f9dc7b1df814df785f6acb545fc92b016e75a7e/python-magic-0.4.24.tar.gz", hash = "sha256:de800df9fb50f8ec5974761054a708af6e4246b03b4bdaee993f948947b0ebcf", size = 17295, upload-time = "2021-06-03T13:49:24.116Z" }
 wheels = [
@@ -5771,11 +5535,9 @@ name = "python-magic"
 version = "0.4.27"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/da/db/0b3e28ac047452d079d375ec6798bf76a036a08182dbb39ed38116a49130/python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b", size = 14677, upload-time = "2022-06-07T20:16:59.508Z" }
 wheels = [
@@ -5784,11 +5546,11 @@ wheels = [
 
 [[package]]
 name = "python-multipart"
-version = "0.0.20"
+version = "0.0.29"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f3/87/f44d7c9f274c7ee665a29b885ec97089ec5dc034c7f3fafa03da9e39a09e/python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13", size = 37158, upload-time = "2024-12-16T19:45:46.972Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4e/fe/70bd71a6738b09a0bdf6480ca6436b167469ca4578b2a0efbe390b4b0e70/python_multipart-0.0.29.tar.gz", hash = "sha256:643e93849196645e2dbdd81a0f8829a23123ad7f797a84a364c6fb3563f18904", size = 45678, upload-time = "2026-05-17T17:29:47.654Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/45/58/38b5afbc1a800eeea951b9285d3912613f2603bdf897a4ab0f4bd7f405fc/python_multipart-0.0.20-py3-none-any.whl", hash = "sha256:8a62d3a8335e06589fe01f2a3e178cdcc632f3fbe0d492ad9ee0ec35aab1f104", size = 24546, upload-time = "2024-12-16T19:45:44.423Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/cb/769cfc37177252872a45a71f3fbdde9d51b471a3f3c14bfe95dde3407386/python_multipart-0.0.29-py3-none-any.whl", hash = "sha256:2ddcc971cef266225f54f552d8fa10bcfbb1f14446caec199060daac59ff2d69", size = 29640, upload-time = "2026-05-17T17:29:45.69Z" },
 ]
 
 [[package]]
@@ -5817,9 +5579,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" },
     { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" },
     { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" },
-    { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" },
 ]
 
 [[package]]
@@ -5866,24 +5625,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" },
     { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" },
     { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" },
-    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" },
-    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" },
-    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" },
-    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" },
-    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" },
 ]
 
 [[package]]
@@ -5999,17 +5740,14 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d5/95/898699cc1a6a5f304ea95376d079843b5c05f4c8c1ec7e55a5cc7ffcea50/ray-2.55.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:f9844a9272ef2e6eb5771025866072cf4234cf4c7cc1a31e235b7de7111864be", size = 65766823, upload-time = "2026-04-22T20:10:20.786Z" },
     { url = "https://files.pythonhosted.org/packages/c9/13/87deecc090c672e45a0cf6f5eef511de448b93f37ef18fd10eb8e8557a0d/ray-2.55.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:b415d590e062f248907e0fe42994943f11726b7178fcf4b1cf5546721fb1a5f8", size = 72818676, upload-time = "2026-04-22T20:10:26.705Z" },
     { url = "https://files.pythonhosted.org/packages/71/d7/fc95d3b8824c62105c64aa1b59c59600b581f608d78a2af753e010936dc9/ray-2.55.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:1380e043eb57cde69b7e9199c6f2558ceeb8f0fc41c97d1d5e50ea042115f302", size = 73678908, upload-time = "2026-04-22T20:10:32.795Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/03/7e552325572e067b23a4584bda8dc6a67af8bd7e03c424d2610bfa93112d/ray-2.55.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:b062045c64c2bce39a51661624f7292c7bbf30f2a9d878627aae31d46da5712d", size = 65774106, upload-time = "2026-04-22T20:10:39.885Z" },
-    { url = "https://files.pythonhosted.org/packages/94/62/607a8859520ce350861425f11f8e15d66c15ee33e6aac812f9e2889b5df4/ray-2.55.1-cp314-cp314-manylinux2014_aarch64.whl", hash = "sha256:4e618d61e1b14b6fde9a586151f3fd9d435b0b85048b997bcaa7f4a533747b2b", size = 72814044, upload-time = "2026-04-22T20:10:46.985Z" },
-    { url = "https://files.pythonhosted.org/packages/04/5a/0699bef04a72d7dc54462960d07ef7a19cd8b1e09979880aba2b6d13cca2/ray-2.55.1-cp314-cp314-manylinux2014_x86_64.whl", hash = "sha256:156ed3e72ad95b645d2006cd71a8dddbcc89b56bfc00027f6225adf78bd9cb74", size = 73644244, upload-time = "2026-04-22T20:10:52.973Z" },
 ]
 
 [package.optional-dependencies]
 data = [
     { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "pandas" },
     { name = "pyarrow", version = "19.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "pyarrow", version = "22.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
@@ -6123,34 +5861,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4f/f9/8bd6b656592f925b6845fcbb4d57603a3ac2fb2373344ffa1ed70aa6820a/regex-2025.11.3-cp313-cp313t-win32.whl", hash = "sha256:9ddc42e68114e161e51e272f667d640f97e84a2b9ef14b7477c53aac20c2d59a", size = 268792, upload-time = "2025-11-03T21:32:44.13Z" },
     { url = "https://files.pythonhosted.org/packages/e5/87/0e7d603467775ff65cd2aeabf1b5b50cc1c3708556a8b849a2fa4dd1542b/regex-2025.11.3-cp313-cp313t-win_amd64.whl", hash = "sha256:7a7c7fdf755032ffdd72c77e3d8096bdcb0eb92e89e17571a196f03d88b11b3c", size = 280214, upload-time = "2025-11-03T21:32:45.853Z" },
     { url = "https://files.pythonhosted.org/packages/8d/d0/2afc6f8e94e2b64bfb738a7c2b6387ac1699f09f032d363ed9447fd2bb57/regex-2025.11.3-cp313-cp313t-win_arm64.whl", hash = "sha256:df9eb838c44f570283712e7cff14c16329a9f0fb19ca492d21d4b7528ee6821e", size = 271469, upload-time = "2025-11-03T21:32:48.026Z" },
-    { url = "https://files.pythonhosted.org/packages/31/e9/f6e13de7e0983837f7b6d238ad9458800a874bf37c264f7923e63409944c/regex-2025.11.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9697a52e57576c83139d7c6f213d64485d3df5bf84807c35fa409e6c970801c6", size = 489089, upload-time = "2025-11-03T21:32:50.027Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/5c/261f4a262f1fa65141c1b74b255988bd2fa020cc599e53b080667d591cfc/regex-2025.11.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e18bc3f73bd41243c9b38a6d9f2366cd0e0137a9aebe2d8ff76c5b67d4c0a3f4", size = 291059, upload-time = "2025-11-03T21:32:51.682Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/57/f14eeb7f072b0e9a5a090d1712741fd8f214ec193dba773cf5410108bb7d/regex-2025.11.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:61a08bcb0ec14ff4e0ed2044aad948d0659604f824cbd50b55e30b0ec6f09c73", size = 288900, upload-time = "2025-11-03T21:32:53.569Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/6b/1d650c45e99a9b327586739d926a1cd4e94666b1bd4af90428b36af66dc7/regex-2025.11.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9c30003b9347c24bcc210958c5d167b9e4f9be786cb380a7d32f14f9b84674f", size = 799010, upload-time = "2025-11-03T21:32:55.222Z" },
-    { url = "https://files.pythonhosted.org/packages/99/ee/d66dcbc6b628ce4e3f7f0cbbb84603aa2fc0ffc878babc857726b8aab2e9/regex-2025.11.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4e1e592789704459900728d88d41a46fe3969b82ab62945560a31732ffc19a6d", size = 864893, upload-time = "2025-11-03T21:32:57.239Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/2d/f238229f1caba7ac87a6c4153d79947fb0261415827ae0f77c304260c7d3/regex-2025.11.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6538241f45eb5a25aa575dbba1069ad786f68a4f2773a29a2bd3dd1f9de787be", size = 911522, upload-time = "2025-11-03T21:32:59.274Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/3d/22a4eaba214a917c80e04f6025d26143690f0419511e0116508e24b11c9b/regex-2025.11.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce22519c989bb72a7e6b36a199384c53db7722fe669ba891da75907fe3587db", size = 803272, upload-time = "2025-11-03T21:33:01.393Z" },
-    { url = "https://files.pythonhosted.org/packages/84/b1/03188f634a409353a84b5ef49754b97dbcc0c0f6fd6c8ede505a8960a0a4/regex-2025.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:66d559b21d3640203ab9075797a55165d79017520685fb407b9234d72ab63c62", size = 787958, upload-time = "2025-11-03T21:33:03.379Z" },
-    { url = "https://files.pythonhosted.org/packages/99/6a/27d072f7fbf6fadd59c64d210305e1ff865cc3b78b526fd147db768c553b/regex-2025.11.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:669dcfb2e38f9e8c69507bace46f4889e3abbfd9b0c29719202883c0a603598f", size = 859289, upload-time = "2025-11-03T21:33:05.374Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/70/1b3878f648e0b6abe023172dacb02157e685564853cc363d9961bcccde4e/regex-2025.11.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:32f74f35ff0f25a5021373ac61442edcb150731fbaa28286bbc8bb1582c89d02", size = 850026, upload-time = "2025-11-03T21:33:07.131Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/d5/68e25559b526b8baab8e66839304ede68ff6727237a47727d240006bd0ff/regex-2025.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e6c7a21dffba883234baefe91bc3388e629779582038f75d2a5be918e250f0ed", size = 789499, upload-time = "2025-11-03T21:33:09.141Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/df/43971264857140a350910d4e33df725e8c94dd9dee8d2e4729fa0d63d49e/regex-2025.11.3-cp314-cp314-win32.whl", hash = "sha256:795ea137b1d809eb6836b43748b12634291c0ed55ad50a7d72d21edf1cd565c4", size = 271604, upload-time = "2025-11-03T21:33:10.9Z" },
-    { url = "https://files.pythonhosted.org/packages/01/6f/9711b57dc6894a55faf80a4c1b5aa4f8649805cb9c7aef46f7d27e2b9206/regex-2025.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:9f95fbaa0ee1610ec0fc6b26668e9917a582ba80c52cc6d9ada15e30aa9ab9ad", size = 280320, upload-time = "2025-11-03T21:33:12.572Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/7e/f6eaa207d4377481f5e1775cdeb5a443b5a59b392d0065f3417d31d80f87/regex-2025.11.3-cp314-cp314-win_arm64.whl", hash = "sha256:dfec44d532be4c07088c3de2876130ff0fbeeacaa89a137decbbb5f665855a0f", size = 273372, upload-time = "2025-11-03T21:33:14.219Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/06/49b198550ee0f5e4184271cee87ba4dfd9692c91ec55289e6282f0f86ccf/regex-2025.11.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ba0d8a5d7f04f73ee7d01d974d47c5834f8a1b0224390e4fe7c12a3a92a78ecc", size = 491985, upload-time = "2025-11-03T21:33:16.555Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/bf/abdafade008f0b1c9da10d934034cb670432d6cf6cbe38bbb53a1cfd6cf8/regex-2025.11.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:442d86cf1cfe4faabf97db7d901ef58347efd004934da045c745e7b5bd57ac49", size = 292669, upload-time = "2025-11-03T21:33:18.32Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/ef/0c357bb8edbd2ad8e273fcb9e1761bc37b8acbc6e1be050bebd6475f19c1/regex-2025.11.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:fd0a5e563c756de210bb964789b5abe4f114dacae9104a47e1a649b910361536", size = 291030, upload-time = "2025-11-03T21:33:20.048Z" },
-    { url = "https://files.pythonhosted.org/packages/79/06/edbb67257596649b8fb088d6aeacbcb248ac195714b18a65e018bf4c0b50/regex-2025.11.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf3490bcbb985a1ae97b2ce9ad1c0f06a852d5b19dde9b07bdf25bf224248c95", size = 807674, upload-time = "2025-11-03T21:33:21.797Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/d9/ad4deccfce0ea336296bd087f1a191543bb99ee1c53093dcd4c64d951d00/regex-2025.11.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3809988f0a8b8c9dcc0f92478d6501fac7200b9ec56aecf0ec21f4a2ec4b6009", size = 873451, upload-time = "2025-11-03T21:33:23.741Z" },
-    { url = "https://files.pythonhosted.org/packages/13/75/a55a4724c56ef13e3e04acaab29df26582f6978c000ac9cd6810ad1f341f/regex-2025.11.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f4ff94e58e84aedb9c9fce66d4ef9f27a190285b451420f297c9a09f2b9abee9", size = 914980, upload-time = "2025-11-03T21:33:25.999Z" },
-    { url = "https://files.pythonhosted.org/packages/67/1e/a1657ee15bd9116f70d4a530c736983eed997b361e20ecd8f5ca3759d5c5/regex-2025.11.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eb542fd347ce61e1321b0a6b945d5701528dca0cd9759c2e3bb8bd57e47964d", size = 812852, upload-time = "2025-11-03T21:33:27.852Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/6f/f7516dde5506a588a561d296b2d0044839de06035bb486b326065b4c101e/regex-2025.11.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2d5919075a1f2e413c00b056ea0c2f065b3f5fe83c3d07d325ab92dce51d6", size = 795566, upload-time = "2025-11-03T21:33:32.364Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/dd/3d10b9e170cc16fb34cb2cef91513cf3df65f440b3366030631b2984a264/regex-2025.11.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:3f8bf11a4827cc7ce5a53d4ef6cddd5ad25595d3c1435ef08f76825851343154", size = 868463, upload-time = "2025-11-03T21:33:34.459Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/8e/935e6beff1695aa9085ff83195daccd72acc82c81793df480f34569330de/regex-2025.11.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:22c12d837298651e5550ac1d964e4ff57c3f56965fc1812c90c9fb2028eaf267", size = 854694, upload-time = "2025-11-03T21:33:36.793Z" },
-    { url = "https://files.pythonhosted.org/packages/92/12/10650181a040978b2f5720a6a74d44f841371a3d984c2083fc1752e4acf6/regex-2025.11.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ba394a3dda9ad41c7c780f60f6e4a70988741415ae96f6d1bf6c239cf01379", size = 799691, upload-time = "2025-11-03T21:33:39.079Z" },
-    { url = "https://files.pythonhosted.org/packages/67/90/8f37138181c9a7690e7e4cb388debbd389342db3c7381d636d2875940752/regex-2025.11.3-cp314-cp314t-win32.whl", hash = "sha256:4bf146dca15cdd53224a1bf46d628bd7590e4a07fbb69e720d561aea43a32b38", size = 274583, upload-time = "2025-11-03T21:33:41.302Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/cd/867f5ec442d56beb56f5f854f40abcfc75e11d10b11fdb1869dd39c63aaf/regex-2025.11.3-cp314-cp314t-win_amd64.whl", hash = "sha256:adad1a1bcf1c9e76346e091d22d23ac54ef28e1365117d99521631078dfec9de", size = 284286, upload-time = "2025-11-03T21:33:43.324Z" },
-    { url = "https://files.pythonhosted.org/packages/20/31/32c0c4610cbc070362bf1d2e4ea86d1ea29014d400a6d6c2486fcfd57766/regex-2025.11.3-cp314-cp314t-win_arm64.whl", hash = "sha256:c54f768482cef41e219720013cd05933b6f971d9562544d691c68699bf2b6801", size = 274741, upload-time = "2025-11-03T21:33:45.557Z" },
 ]
 
 [[package]]
@@ -6177,25 +5887,15 @@ name = "requests"
 version = "2.33.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "certifi", marker = "python_full_version >= '3.11'" },
@@ -6352,35 +6052,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" },
     { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" },
     { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = "2025-11-30T20:23:25.908Z" },
-    { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" },
-    { url = "https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" },
-    { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" },
-    { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" },
-    { url = "https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" },
-    { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, upload-time = "2025-11-30T20:23:44.086Z" },
-    { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" },
-    { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" },
-    { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" },
-    { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = "2025-11-30T20:24:01.759Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" },
-    { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" },
-    { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" },
     { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" },
     { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" },
     { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" },
@@ -6442,25 +6113,15 @@ name = "ruff"
 version = "0.15.12"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/99/43/3291f1cc9106f4c63bdce7a8d0df5047fe8422a75b091c16b5e9355e0b11/ruff-0.15.12.tar.gz", hash = "sha256:ecea26adb26b4232c0c2ca19ccbc0083a68344180bba2a600605538ce51a40a6", size = 4643852, upload-time = "2026-04-24T18:17:14.305Z" }
 wheels = [
@@ -6488,25 +6149,15 @@ name = "s3fs"
 version = "2025.3.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "aiobotocore", marker = "python_full_version >= '3.11'" },
@@ -6536,6 +6187,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2d/fc/56cba14af8ad8fd020c85b6e44328520ac55939bb1f9d01444ad470504cb/s3fs-2025.10.0-py3-none-any.whl", hash = "sha256:da7ef25efc1541f5fca8e1116361e49ea1081f83f4e8001fbd77347c625da28a", size = 30357, upload-time = "2025-10-30T15:06:03.48Z" },
 ]
 
+[[package]]
+name = "s3transfer"
+version = "0.15.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ca/bb/940d6af975948c1cc18f44545ffb219d3c35d78ec972b42ae229e8e37e08/s3transfer-0.15.0.tar.gz", hash = "sha256:d36fac8d0e3603eff9b5bfa4282c7ce6feb0301a633566153cbd0b93d11d8379", size = 152185, upload-time = "2025-11-20T20:28:56.327Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5f/e1/5ef25f52973aa12a19cf4e1375d00932d7fb354ffd310487ba7d44225c1a/s3transfer-0.15.0-py3-none-any.whl", hash = "sha256:6f8bf5caa31a0865c4081186689db1b2534cef721d104eb26101de4b9d6a5852", size = 85984, upload-time = "2025-11-20T20:28:55.046Z" },
+]
+
 [[package]]
 name = "sacrebleu"
 version = "2.6.0"
@@ -6543,8 +6206,8 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "colorama", marker = "python_full_version >= '3.11'" },
     { name = "lxml", marker = "python_full_version >= '3.11'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "portalocker", marker = "python_full_version >= '3.11'" },
     { name = "regex", marker = "python_full_version >= '3.11'" },
     { name = "tabulate", marker = "python_full_version >= '3.11'" },
@@ -6586,8 +6249,8 @@ version = "1.7.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "joblib", marker = "python_full_version >= '3.11'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "scipy", marker = "python_full_version >= '3.11'" },
     { name = "threadpoolctl", marker = "python_full_version >= '3.11'" },
 ]
@@ -6618,11 +6281,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/83/87/066cafc896ee540c34becf95d30375fe5cbe93c3b75a0ee9aa852cd60021/scikit_learn-1.7.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98335fb98509b73385b3ab2bd0639b1f610541d3988ee675c670371d6a87aa7c", size = 9527094, upload-time = "2025-09-09T08:21:11.486Z" },
     { url = "https://files.pythonhosted.org/packages/9c/2b/4903e1ccafa1f6453b1ab78413938c8800633988c838aa0be386cbb33072/scikit_learn-1.7.2-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:191e5550980d45449126e23ed1d5e9e24b2c68329ee1f691a3987476e115e09c", size = 9367436, upload-time = "2025-09-09T08:21:13.602Z" },
     { url = "https://files.pythonhosted.org/packages/b5/aa/8444be3cfb10451617ff9d177b3c190288f4563e6c50ff02728be67ad094/scikit_learn-1.7.2-cp313-cp313t-win_amd64.whl", hash = "sha256:57dc4deb1d3762c75d685507fbd0bc17160144b2f2ba4ccea5dc285ab0d0e973", size = 9275749, upload-time = "2025-09-09T08:21:15.96Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/82/dee5acf66837852e8e68df6d8d3a6cb22d3df997b733b032f513d95205b7/scikit_learn-1.7.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fa8f63940e29c82d1e67a45d5297bdebbcb585f5a5a50c4914cc2e852ab77f33", size = 9208906, upload-time = "2025-09-09T08:21:18.557Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/30/9029e54e17b87cb7d50d51a5926429c683d5b4c1732f0507a6c3bed9bf65/scikit_learn-1.7.2-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:f95dc55b7902b91331fa4e5845dd5bde0580c9cd9612b1b2791b7e80c3d32615", size = 8627836, upload-time = "2025-09-09T08:21:20.695Z" },
-    { url = "https://files.pythonhosted.org/packages/60/18/4a52c635c71b536879f4b971c2cedf32c35ee78f48367885ed8025d1f7ee/scikit_learn-1.7.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9656e4a53e54578ad10a434dc1f993330568cfee176dff07112b8785fb413106", size = 9426236, upload-time = "2025-09-09T08:21:22.645Z" },
-    { url = "https://files.pythonhosted.org/packages/99/7e/290362f6ab582128c53445458a5befd471ed1ea37953d5bcf80604619250/scikit_learn-1.7.2-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96dc05a854add0e50d3f47a1ef21a10a595016da5b007c7d9cd9d0bffd1fcc61", size = 9312593, upload-time = "2025-09-09T08:21:24.65Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/87/24f541b6d62b1794939ae6422f8023703bbf6900378b2b34e0b4384dfefd/scikit_learn-1.7.2-cp314-cp314-win_amd64.whl", hash = "sha256:bb24510ed3f9f61476181e4db51ce801e2ba37541def12dc9333b946fc7a9cf8", size = 8820007, upload-time = "2025-09-09T08:21:26.713Z" },
 ]
 
 [[package]]
@@ -6630,8 +6288,8 @@ name = "scipy"
 version = "1.17.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" }
 wheels = [
@@ -6675,26 +6333,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910, upload-time = "2026-02-23T00:20:34.743Z" },
     { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980, upload-time = "2026-02-23T00:20:40.575Z" },
     { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543, upload-time = "2026-02-23T00:20:45.313Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510, upload-time = "2026-02-23T00:21:01.015Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131, upload-time = "2026-02-23T00:21:05.888Z" },
-    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032, upload-time = "2026-02-23T00:21:09.904Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766, upload-time = "2026-02-23T00:21:14.313Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007, upload-time = "2026-02-23T00:21:19.663Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333, upload-time = "2026-02-23T00:21:25.278Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066, upload-time = "2026-02-23T00:21:31.358Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763, upload-time = "2026-02-23T00:21:37.247Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984, upload-time = "2026-02-23T00:22:35.023Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877, upload-time = "2026-02-23T00:22:39.798Z" },
-    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750, upload-time = "2026-02-23T00:21:42.289Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858, upload-time = "2026-02-23T00:21:47.706Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723, upload-time = "2026-02-23T00:21:52.039Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098, upload-time = "2026-02-23T00:21:56.185Z" },
-    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397, upload-time = "2026-02-23T00:22:01.404Z" },
-    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163, upload-time = "2026-02-23T00:22:07.024Z" },
-    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291, upload-time = "2026-02-23T00:22:12.585Z" },
-    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317, upload-time = "2026-02-23T00:22:18.513Z" },
-    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327, upload-time = "2026-02-23T00:22:24.442Z" },
-    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165, upload-time = "2026-02-23T00:22:29.563Z" },
 ]
 
 [[package]]
@@ -6703,8 +6341,8 @@ version = "5.4.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub", marker = "python_full_version >= '3.11'" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "scikit-learn", marker = "python_full_version >= '3.11'" },
     { name = "scipy", marker = "python_full_version >= '3.11'" },
     { name = "torch", marker = "python_full_version >= '3.11'" },
@@ -6763,22 +6401,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/5e/ae66c361023a470afcbc1fbb8da722c72ea678a2fcd9a18f1a12598c7501/sentencepiece-0.2.1-cp313-cp313t-win32.whl", hash = "sha256:89a3ea015517c42c0341d0d962f3e6aaf2cf10d71b1932d475c44ba48d00aa2b", size = 1002501, upload-time = "2025-08-12T07:00:16.966Z" },
     { url = "https://files.pythonhosted.org/packages/c1/03/d332828c4ff764e16c1b56c2c8f9a33488bbe796b53fb6b9c4205ddbf167/sentencepiece-0.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:33f068c9382dc2e7c228eedfd8163b52baa86bb92f50d0488bf2b7da7032e484", size = 1057555, upload-time = "2025-08-12T07:00:18.573Z" },
     { url = "https://files.pythonhosted.org/packages/88/14/5aee0bf0864df9bd82bd59e7711362908e4935e3f9cdc1f57246b5d5c9b9/sentencepiece-0.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:b3616ad246f360e52c85781e47682d31abfb6554c779e42b65333d4b5f44ecc0", size = 1036042, upload-time = "2025-08-12T07:00:20.209Z" },
-    { url = "https://files.pythonhosted.org/packages/24/9c/89eb8b2052f720a612478baf11c8227dcf1dc28cd4ea4c0c19506b5af2a2/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5d0350b686c320068702116276cfb26c066dc7e65cfef173980b11bb4d606719", size = 1943147, upload-time = "2025-08-12T07:00:21.809Z" },
-    { url = "https://files.pythonhosted.org/packages/82/0b/a1432bc87f97c2ace36386ca23e8bd3b91fb40581b5e6148d24b24186419/sentencepiece-0.2.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c7f54a31cde6fa5cb030370566f68152a742f433f8d2be458463d06c208aef33", size = 1325624, upload-time = "2025-08-12T07:00:23.289Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/99/bbe054ebb5a5039457c590e0a4156ed073fb0fe9ce4f7523404dd5b37463/sentencepiece-0.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c83b85ab2d6576607f31df77ff86f28182be4a8de6d175d2c33ca609925f5da1", size = 1253670, upload-time = "2025-08-12T07:00:24.69Z" },
-    { url = "https://files.pythonhosted.org/packages/19/ad/d5c7075f701bd97971d7c2ac2904f227566f51ef0838dfbdfdccb58cd212/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1855f57db07b51fb51ed6c9c452f570624d2b169b36f0f79ef71a6e6c618cd8b", size = 1316247, upload-time = "2025-08-12T07:00:26.435Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/03/35fbe5f3d9a7435eebd0b473e09584bd3cc354ce118b960445b060d33781/sentencepiece-0.2.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01e6912125cb45d3792f530a4d38f8e21bf884d6b4d4ade1b2de5cf7a8d2a52b", size = 1387894, upload-time = "2025-08-12T07:00:28.339Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/aa/956ef729aafb6c8f9c443104c9636489093bb5c61d6b90fc27aa1a865574/sentencepiece-0.2.1-cp314-cp314-win32.whl", hash = "sha256:c415c9de1447e0a74ae3fdb2e52f967cb544113a3a5ce3a194df185cbc1f962f", size = 1096698, upload-time = "2025-08-12T07:00:29.764Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/cb/fe400d8836952cc535c81a0ce47dc6875160e5fedb71d2d9ff0e9894c2a6/sentencepiece-0.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:881b2e44b14fc19feade3cbed314be37de639fc415375cefaa5bc81a4be137fd", size = 1155115, upload-time = "2025-08-12T07:00:32.865Z" },
-    { url = "https://files.pythonhosted.org/packages/32/89/047921cf70f36c7b6b6390876b2399b3633ab73b8d0cb857e5a964238941/sentencepiece-0.2.1-cp314-cp314-win_arm64.whl", hash = "sha256:2005242a16d2dc3ac5fe18aa7667549134d37854823df4c4db244752453b78a8", size = 1133890, upload-time = "2025-08-12T07:00:34.763Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/11/5b414b9fae6255b5fb1e22e2ed3dc3a72d3a694e5703910e640ac78346bb/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:a19adcec27c524cb7069a1c741060add95f942d1cbf7ad0d104dffa0a7d28a2b", size = 1946081, upload-time = "2025-08-12T07:00:36.97Z" },
-    { url = "https://files.pythonhosted.org/packages/77/eb/7a5682bb25824db8545f8e5662e7f3e32d72a508fdce086029d89695106b/sentencepiece-0.2.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:e37e4b4c4a11662b5db521def4e44d4d30ae69a1743241412a93ae40fdcab4bb", size = 1327406, upload-time = "2025-08-12T07:00:38.669Z" },
-    { url = "https://files.pythonhosted.org/packages/03/b0/811dae8fb9f2784e138785d481469788f2e0d0c109c5737372454415f55f/sentencepiece-0.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:477c81505db072b3ab627e7eab972ea1025331bd3a92bacbf798df2b75ea86ec", size = 1254846, upload-time = "2025-08-12T07:00:40.611Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/23/195b2e7ec85ebb6a547969f60b723c7aca5a75800ece6cc3f41da872d14e/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:010f025a544ef770bb395091d57cb94deb9652d8972e0d09f71d85d5a0816c8c", size = 1315721, upload-time = "2025-08-12T07:00:42.914Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/aa/553dbe4178b5f23eb28e59393dddd64186178b56b81d9b8d5c3ff1c28395/sentencepiece-0.2.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:733e59ff1794d26db706cd41fc2d7ca5f6c64a820709cb801dc0ea31780d64ab", size = 1387458, upload-time = "2025-08-12T07:00:44.56Z" },
-    { url = "https://files.pythonhosted.org/packages/66/7c/08ff0012507297a4dd74a5420fdc0eb9e3e80f4e88cab1538d7f28db303d/sentencepiece-0.2.1-cp314-cp314t-win32.whl", hash = "sha256:d3233770f78e637dc8b1fda2cd7c3b99ec77e7505041934188a4e7fe751de3b0", size = 1099765, upload-time = "2025-08-12T07:00:46.058Z" },
-    { url = "https://files.pythonhosted.org/packages/91/d5/2a69e1ce15881beb9ddfc7e3f998322f5cedcd5e4d244cb74dade9441663/sentencepiece-0.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5e4366c97b68218fd30ea72d70c525e6e78a6c0a88650f57ac4c43c63b234a9d", size = 1157807, upload-time = "2025-08-12T07:00:47.673Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/16/54f611fcfc2d1c46cbe3ec4169780b2cfa7cf63708ef2b71611136db7513/sentencepiece-0.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:105e36e75cbac1292642045458e8da677b2342dcd33df503e640f0b457cb6751", size = 1136264, upload-time = "2025-08-12T07:00:49.485Z" },
 ]
 
 [[package]]
@@ -6799,10 +6421,7 @@ name = "setuptools"
 version = "81.0.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0d/1c/73e719955c59b8e424d015ab450f51c0af856ae46ea2da83eba51cc88de1/setuptools-81.0.0.tar.gz", hash = "sha256:487b53915f52501f0a79ccfd0c02c165ffe06631443a886740b91af4b7a5845a", size = 1198299, upload-time = "2026-02-06T21:10:39.601Z" }
@@ -6815,15 +6434,13 @@ name = "setuptools"
 version = "82.0.1"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" }
 wheels = [
@@ -6839,6 +6456,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
 ]
 
+[[package]]
+name = "simple-parsing"
+version = "0.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "docstring-parser" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/be/67/e3e5b89f1c81ca574a157104b0ecebfc3096933cbf58f644c9cb0a56c94f/simple_parsing-0.1.8.tar.gz", hash = "sha256:19c2a9002ebd7ad281fce579f9b2a0aa0c4d67e1688cee0e8cdf6d8e98ec2c18", size = 255933, upload-time = "2026-01-20T23:29:05.258Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/70/46/eab9fe2a4a2f6665a7c79b2007121a00ba95502fef50c1537d8147b4f91c/simple_parsing-0.1.8-py3-none-any.whl", hash = "sha256:4d1ef136a28674b3ebb9760cacda4d6f01de32de0b280a869df977d182f12947", size = 113438, upload-time = "2026-01-20T23:29:04.17Z" },
+]
+
 [[package]]
 name = "six"
 version = "1.17.0"
@@ -6905,6 +6535,84 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679, upload-time = "2025-08-27T15:39:50.179Z" },
 ]
 
+[[package]]
+name = "spacy"
+version = "3.8.14"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "catalogue", marker = "python_full_version >= '3.11'" },
+    { name = "confection", marker = "python_full_version >= '3.11'" },
+    { name = "cymem", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "murmurhash", marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "packaging", marker = "python_full_version >= '3.11'" },
+    { name = "preshed", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "setuptools", version = "81.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "82.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "spacy-legacy", marker = "python_full_version >= '3.11'" },
+    { name = "spacy-loggers", marker = "python_full_version >= '3.11'" },
+    { name = "srsly", marker = "python_full_version >= '3.11'" },
+    { name = "thinc", marker = "python_full_version >= '3.11'" },
+    { name = "tqdm", marker = "python_full_version >= '3.11'" },
+    { name = "typer", marker = "python_full_version >= '3.11'" },
+    { name = "wasabi", marker = "python_full_version >= '3.11'" },
+    { name = "weasel", marker = "python_full_version >= '3.11'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/d5/9860449c9fbed97634fb974bbf7a128ae269f5e69f3d792a9679d2354141/spacy-3.8.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4a06e5cc029910eaa4a3c5a0a997f07fed6a41aba46b05da9f58643bc06fe8b9", size = 6625650, upload-time = "2026-03-29T10:40:07.13Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/06/f896dfd0ea78093ae8221a958cd7df85a8818d58ba25c1a574aa153724e2/spacy-3.8.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d1bdc25a8351023b42619513bac82c861a25f45bfa4476d7e9673b6f0a177d3", size = 6449875, upload-time = "2026-03-29T10:40:09.329Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ec/2b5b12396eb4cdacda26af497d6e1a798e1eceba060c6bb63d2c599c4988/spacy-3.8.14-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:93b369785413772a7121c29cafbf71320623d2e9ade5881acf81b5e3e8f1b4ae", size = 30761626, upload-time = "2026-03-29T10:40:11.736Z" },
+    { url = "https://files.pythonhosted.org/packages/51/cb/8c103e81de0d9acafc30df06d0aba0d229a559154f4c7c889895d9f8f770/spacy-3.8.14-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6aa0b9308e9f4bb34ba8ff3b65df605f6d23891fe296f3e2f7b6879544637874", size = 30999811, upload-time = "2026-03-29T10:40:14.428Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/3a/99ff50b2b489ebd7ded9a5192dcf5e5003a3f6b785479ae1775f6e46a682/spacy-3.8.14-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cedabf5602ca1a4d5b14fc680bb7e5ac569139e1f3d780b261d373dc19f4c9d1", size = 31039385, upload-time = "2026-03-29T10:40:17.192Z" },
+    { url = "https://files.pythonhosted.org/packages/04/0f/c8f7da2aa0e58874c9c5555104ee83016396a7cf5b347d97d77c6d775107/spacy-3.8.14-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cc30623c581aa8f8e9763cb8625641d3fe96fa9fa95992ffb8544aaf6e3afcf3", size = 31882665, upload-time = "2026-03-29T10:40:20.232Z" },
+    { url = "https://files.pythonhosted.org/packages/16/a0/a990b30ba1a09228ee49270d996be51e0b2245350631b0e4fdd655d019f1/spacy-3.8.14-cp310-cp310-win_amd64.whl", hash = "sha256:161671338396eea2455f9b9dee8e37e00f00bc2f352f6f8cf73a4cd2b708545b", size = 15359609, upload-time = "2026-03-29T10:40:23.513Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/8c/0ccf32d9a6b4fd8737bba33d599ddb98934399c1d523f825a4beb4bd1495/spacy-3.8.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8c55cb123c3edfba8c252ce6ae27ffb3d7f60a53ba5e108c3534421586c5fdda", size = 6617470, upload-time = "2026-03-29T10:40:25.572Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/61/7f7d38e71daac7f91ffd362fb15645b6f9a68ad231e0ed6ff5c1dc6f6930/spacy-3.8.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ab6c1ace316338dac334fc93c849994bbd717f9ebf59d2bc4158e978b2f542ee", size = 6441524, upload-time = "2026-03-29T10:40:27.648Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/ef/18385aa5aeb9bcb299e8074da162b24e5c8bea5aa4d1dfa3dbafb35e9d1f/spacy-3.8.14-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7bece0450cd8ab841cfa8527fcc0ce18c4454f28e3b9fca42a450803a067355b", size = 32050591, upload-time = "2026-03-29T10:40:29.704Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/67/5c4a65ed2cedc598ad000a2b9f45afc76bb8d17a592cc01082dffa8bbc50/spacy-3.8.14-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc5e5f2ed121d57d819d247bb59253dc320a58acbd237b85f86c2aa38cab6bd1", size = 32296467, upload-time = "2026-03-29T10:40:32.557Z" },
+    { url = "https://files.pythonhosted.org/packages/03/a7/28c118879791b3a7ffa81796d22203daac428e6f75572f1b8da1539e1ac6/spacy-3.8.14-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c4e5bc5cdefe39ea139985776a2e8eae05e7ff2bf51ca1bd65247dc45feeb8e", size = 32288404, upload-time = "2026-03-29T10:40:35.583Z" },
+    { url = "https://files.pythonhosted.org/packages/72/1c/32aefcea2468782fcdb994f2f96cac93dc74f6589ce01047db42d9a299a2/spacy-3.8.14-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2c228f4c9ae618173334c17adb748d66b574b6594bc3575233e15cd5ad1cb26b", size = 33113476, upload-time = "2026-03-29T10:40:38.577Z" },
+    { url = "https://files.pythonhosted.org/packages/86/32/fc00532eabeace451175dd9b152ddd636e8f6a42248b5d90141f98be2af5/spacy-3.8.14-cp311-cp311-win_amd64.whl", hash = "sha256:6f51d1ce8b1ba30123f6bef6e795c4bc5466608e6e8a015dc828bd21d399aa9c", size = 15359704, upload-time = "2026-03-29T10:40:41.25Z" },
+    { url = "https://files.pythonhosted.org/packages/de/31/89ff6722ec91f328dc717932849c6f57249c8a9d429d8670a6c8f70e576b/spacy-3.8.14-cp311-cp311-win_arm64.whl", hash = "sha256:c0c6c9d8771cc3708e309b07310d330fc8443a6bca34f4ff20b0f22751d8faf9", size = 14717168, upload-time = "2026-03-29T10:40:43.916Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/78/e4f2ae19a791cae756cd0e801204953eaec4e9ab75a60ad39f671dbb8d5a/spacy-3.8.14-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:726f02c60a2c6b0029167370d22d51731172a053d29c7e2ea6190db6de3ab483", size = 6218335, upload-time = "2026-03-29T10:40:46.298Z" },
+    { url = "https://files.pythonhosted.org/packages/06/df/178bbab47fa209c8baf2f1e609cbddc6b18a985200be1ceee22bd5b89beb/spacy-3.8.14-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e3ebe50b93f2d40e8ec3451255528bb622ccb12be39fd140bb87668ce8d1075b", size = 6033860, upload-time = "2026-03-29T10:40:47.861Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/e8/048d83b73b28686307bd9a60878a58de7b7b21b562ca4de8b5bd558031e9/spacy-3.8.14-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:daeb64b048f12c059997281aed53eb8776d26416dd313cf17ad6f63124b2b564", size = 32725099, upload-time = "2026-03-29T10:40:50.194Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/3f/1799af5f4ccc8eb7500e4a20ca301488134429dba08cda5be68ce6ab2992/spacy-3.8.14-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6d45715a24446f23b98ec3f09409a1d4111983d1d64613250ee38c3270e21853", size = 33205838, upload-time = "2026-03-29T10:40:53.029Z" },
+    { url = "https://files.pythonhosted.org/packages/78/07/81ab9acd0ec64bfdd7339acfc4cf35f5fb74bbbb0b2be7e64d717c416bac/spacy-3.8.14-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1069a8be34940809f8462eb69f09a3f0ce59bf8b9cb82475f2a8e3580f50ece0", size = 32090380, upload-time = "2026-03-29T10:40:56.115Z" },
+    { url = "https://files.pythonhosted.org/packages/74/a5/b081b5bd3cedb2634c23eb470b5e24c65c894c57646567f47627291c2b3f/spacy-3.8.14-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2dfa77aec7fdebac0455d8afd4ce1d92d6f868b03d507ed1976179a63db7b374", size = 32991946, upload-time = "2026-03-29T10:40:58.852Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/55/4371413a6dfc1fa837282a365498165f828c2f3fe018dfb35336acc869e0/spacy-3.8.14-cp312-cp312-win_amd64.whl", hash = "sha256:9def18c76a4472b326cb91a195623c9ca38a2b86999ad2df9e00b49ba8c63734", size = 14226946, upload-time = "2026-03-29T10:41:01.63Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/5e/12ac876017da6c1e6b72afcc3c8b309996227fd3aa15382cd3311aee21b8/spacy-3.8.14-cp312-cp312-win_arm64.whl", hash = "sha256:d6257133357e4801c9c5d011925af5439b0a015aacf3c16528aa0009982431c7", size = 13628765, upload-time = "2026-03-29T10:41:03.806Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/e5/822bbdfa459fee863ef2e9879a34b0ae5db7cd1e3eb76d32c766f19222e9/spacy-3.8.14-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b4f60fa8b9641a5e93e7a96db0cdd106d05d61756bf1d0ddcd1705ad347909a", size = 6202114, upload-time = "2026-03-29T10:41:06.119Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/de/0e512154113e1f341567f2b9341835775e4180c180221e60faedaebb2f65/spacy-3.8.14-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0860c57220c633ccb20468bcd64bfb0d28908990c371a8857951d093a148dc8e", size = 6015458, upload-time = "2026-03-29T10:41:07.79Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/4f/29c7e56afc7db07348a9e0efe0243b5eef465d5dc3d56433f164378c3fa6/spacy-3.8.14-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c24620b7dba879c69cebc51ef3b1107d4d4e44a1e0d4baa439372887d00c3fd9", size = 32510659, upload-time = "2026-03-29T10:41:09.88Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/ce/cae678f664d5467016819253f5d6e52f8e68a12d8e799b651d73ec2a9a4b/spacy-3.8.14-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9699c1248d115d5825987c287a6f6acd66386ef3ebee7994ee67ba093e932c59", size = 32841057, upload-time = "2026-03-29T10:41:12.585Z" },
+    { url = "https://files.pythonhosted.org/packages/04/d4/419868afd449bdd367df005932537eea66c71e97c899ba278f3124933f3c/spacy-3.8.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:042d799e342fdb6bb5b02a4213a95acc9116c40ed3c849bb0a8296fbe648ec22", size = 31763252, upload-time = "2026-03-29T10:41:15.569Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/53/df5c1fee45f200b749ba72eeb536fbb2c545fc56230324954263b2f3be00/spacy-3.8.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:69b2264294097336e86832e8663f1ab3a7215621184863c96c082ab17ee11937", size = 32717872, upload-time = "2026-03-29T10:41:18.193Z" },
+    { url = "https://files.pythonhosted.org/packages/12/c2/f1882ec2f5cc9c4e73cf2132997a03c397d7ceeb5ee7f7bb878b51a16365/spacy-3.8.14-cp313-cp313-win_amd64.whl", hash = "sha256:4b6d4f20e291a7c70e37de2f246622b44a0ce82efaa710c9801c6bd599e75177", size = 14220335, upload-time = "2026-03-29T10:41:20.89Z" },
+]
+
+[[package]]
+name = "spacy-legacy"
+version = "3.0.12"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d9/79/91f9d7cc8db5642acad830dcc4b49ba65a7790152832c4eceb305e46d681/spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774", size = 23806, upload-time = "2023-01-23T09:04:15.104Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f", size = 29971, upload-time = "2023-01-23T09:04:13.45Z" },
+]
+
+[[package]]
+name = "spacy-loggers"
+version = "1.0.5"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/67/3d/926db774c9c98acf66cb4ed7faf6c377746f3e00b84b700d0868b95d0712/spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24", size = 20811, upload-time = "2023-09-11T12:26:52.323Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343, upload-time = "2023-09-11T12:26:50.586Z" },
+]
+
 [[package]]
 name = "sphinx"
 version = "8.1.3"
@@ -6942,25 +6650,15 @@ name = "sphinx"
 version = "8.2.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "alabaster", marker = "python_full_version >= '3.11'" },
@@ -7012,25 +6710,15 @@ name = "sphinx-autobuild"
 version = "2025.8.25"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and platform_machine == 'x86_64' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and sys_platform != 'linux'",
-    "(python_full_version == '3.13.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.13.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.13.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.13.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.12.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.12.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
-    "(python_full_version == '3.11.*' and platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform != 'linux') or (python_full_version == '3.11.*' and sys_platform == 'darwin')",
-    "(python_full_version == '3.11.*' and platform_machine == 'aarch64' and sys_platform != 'darwin' and sys_platform != 'linux') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform != 'darwin' and sys_platform != 'linux')",
+    "python_full_version >= '3.13' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version >= '3.13' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and platform_machine != 'x86_64' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version >= '3.11'" },
@@ -7158,31 +6846,74 @@ name = "sqlfluff"
 version = "3.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "chardet", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "click", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "colorama", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "diff-cover", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "jinja2", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pathspec", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "platformdirs", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pytest", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "pyyaml", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "regex", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "tblib", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "tqdm", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "chardet", marker = "python_full_version >= '3.11'" },
+    { name = "click", marker = "python_full_version >= '3.11'" },
+    { name = "colorama", marker = "python_full_version >= '3.11'" },
+    { name = "diff-cover", marker = "python_full_version >= '3.11'" },
+    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "pathspec", marker = "python_full_version >= '3.11'" },
+    { name = "platformdirs", marker = "python_full_version >= '3.11'" },
+    { name = "pytest", marker = "python_full_version >= '3.11'" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
+    { name = "regex", marker = "python_full_version >= '3.11'" },
+    { name = "tblib", marker = "python_full_version >= '3.11'" },
+    { name = "tqdm", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/4c/a8/d3dc6c510cc3bba9abbf7a3052a96d5ce6771b71dda141846003fa37277a/sqlfluff-3.5.0.tar.gz", hash = "sha256:2d0a546078ffb021de7021b9a6c2a50e5eef590daa820d5f1b082d24a1d5e1d4", size = 921199, upload-time = "2025-10-18T19:33:07.778Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/47/d5/83c3eacdd6c3249fb5f8a0b5612ab10b661862e0df869951f45fd837448d/sqlfluff-3.5.0-py3-none-any.whl", hash = "sha256:6e5fb7a0c491676ded68912245fc0627e88f8b0e6290bd4b54a65ce735f69716", size = 921597, upload-time = "2025-10-18T19:33:05.839Z" },
 ]
 
+[[package]]
+name = "srsly"
+version = "2.5.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "catalogue", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2b/db/f794f219a6c788b881252d2536a8c4a97d2bdaadc690391e1cb53d123d71/srsly-2.5.3.tar.gz", hash = "sha256:08f98dbecbff3a31466c4ae7c833131f59d3655a0ad8ac749e6e2c149e2b0680", size = 490881, upload-time = "2026-03-23T11:56:59.865Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/67/e6d4decfb0cdc95b54c60854a1a6d1702983c39206c2b9f70f4ab18b17c8/srsly-2.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c812302a9acfe171e82f680b7ad642014cd017380b2c678441b3da4fb513c498", size = 657202, upload-time = "2026-03-23T11:55:34.938Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/5d/cb8b093d0836e59c152de6dfdb5db80c6408b00def0123f26d24bffde480/srsly-2.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91688edb1f49110870d2c215db2cf445f1763c14173698ead0818908c51fb2a1", size = 657951, upload-time = "2026-03-23T11:55:36.571Z" },
+    { url = "https://files.pythonhosted.org/packages/71/a1/5d2fb4c6a8e0e39dd1fb23bdd8feb1f2525ce90b28946f9f58ac5d3a039c/srsly-2.5.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1fd6c35c65c4d2435ae5bfb57b59682cf9b61606318a2a761856be9d7cc2d9e3", size = 1119766, upload-time = "2026-03-23T11:55:38.351Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/83/0862ffac8c06ed595dd1e28f261c37956585b9cf6b9bd049f8430a4c2daf/srsly-2.5.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b9df76d5a6bbf50967589bd42df3c522dd88babea2be745a507f56b41ab40626", size = 1120674, upload-time = "2026-03-23T11:55:39.644Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/06/42f72bab50876a708a10e6fc026ae8c7f185507d9f27544fa4ee8567c5fd/srsly-2.5.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a595958d0b1ff6d59c2570a3f0d1c8e36ab9f89d6e1b9c96fa7eb5e1a8698510", size = 1078505, upload-time = "2026-03-23T11:55:41.299Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/f4/dfb86bc5c3abee267fb2f34895ea80d0159a084987a93d56ed1bf5ebefe4/srsly-2.5.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bc0ad5be2aeb9ff29c8512848d39d7c63fdd4bfbb5516bc523f5de5a77e55e6d", size = 1090635, upload-time = "2026-03-23T11:55:42.7Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/a6/561b46eff4477191dd649e09dd9b88afc44aad7ce204c45f4e45ad04861d/srsly-2.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:d2b8cfd8aee4d06ab335d359e4095d206102300a5e105a4b4bc69acca42427a6", size = 651653, upload-time = "2026-03-23T11:55:44.429Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/05/b122a1afaf8e8644d10f0203ad5174993910e6f727843089f0d48b444340/srsly-2.5.3-cp310-cp310-win_arm64.whl", hash = "sha256:c378afcb7dd7c42f426a66112496c949fc39e5883de6817d86e60afa51720ccc", size = 639118, upload-time = "2026-03-23T11:55:45.796Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/36/5d7bb412d52e9cca787f9bfe838b596367189b254e50bf90f234a97184bf/srsly-2.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:785a09216ac31570fb301ddb9f61ee73d1f18f8b9561f712dce0b8ac8628bc88", size = 656760, upload-time = "2026-03-23T11:55:47.155Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/dc/124f008cd2be3e887e972cbdeb17c5aee0f42093eca02c7cfd63bb5daf19/srsly-2.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0017c7d2a0cd9a4f1bdc00d946b45edcf90bb0e271e8f084c1ce542bf6708c32", size = 657503, upload-time = "2026-03-23T11:55:48.681Z" },
+    { url = "https://files.pythonhosted.org/packages/35/8a/2c97244ebab125d55f1bfb7bb94e9572b3e819410dffd6a040eca1112350/srsly-2.5.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:66ebae2c70305987341519ec1a720072a3cb3e4b1d52ac0e9e841f4d02658d3d", size = 1139161, upload-time = "2026-03-23T11:55:50.179Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/ea/ecd396188f7591d80b89665f7af9e3ae02e42683daef57033ad7993ad3f9/srsly-2.5.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4ca4a068f6e14d84113a02fcb875c6b50a6285a12938c0e7a157eb3a63c50a86", size = 1142438, upload-time = "2026-03-23T11:55:52.607Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/65/143e2e143c53d498ad0956f69d0e09189aa7a6e0ee6017758c285ba1ab2d/srsly-2.5.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e283fa2a8f7350fb9fb70ecdee28d59d39c92f4c7f1cc90a44d6b86db3b3a8b3", size = 1101783, upload-time = "2026-03-23T11:55:53.906Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/86/1392a5593de0cd3d08c2d6c071b877c84358a37f63172c4e9cb71706842d/srsly-2.5.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9ffc97e22730ea97b00f7c303ccc60b1305e786afadb2a4a46578dafa4d29da0", size = 1115876, upload-time = "2026-03-23T11:55:55.624Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/a5/6193aa4c08e488821538fcbce2282449e228fd2183ed67d118bb5ccd8b54/srsly-2.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:f09b551f6c3e334652831ac68c770ee4284741ce0a3895bf1ccf2a1178d66cdd", size = 651733, upload-time = "2026-03-23T11:55:56.964Z" },
+    { url = "https://files.pythonhosted.org/packages/66/a8/a73181743b6d237026615ca75c3fb3e4780736f1390550a7350d0c7f1149/srsly-2.5.3-cp311-cp311-win_arm64.whl", hash = "sha256:21cf09e417d3e4f3fbf7dd337fd6d948c97abd01896b9b4cb80e81cd9778a73a", size = 639124, upload-time = "2026-03-23T11:55:58.532Z" },
+    { url = "https://files.pythonhosted.org/packages/02/cc/e9f7fcec4cc92ad8bad6316c4241638b8cf7380382d4489d94ec6c436452/srsly-2.5.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:71e51c046ccbeefb86524c6b1e17574f579c6ac4dc8ea4a09437d3e8f88342d3", size = 658379, upload-time = "2026-03-23T11:55:59.85Z" },
+    { url = "https://files.pythonhosted.org/packages/21/e4/fea4512e9785f58509b2cf67d993323848e583161b5fcfdc7dd9d7c1f3df/srsly-2.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f73c0db911552e94fe2016e1759d261d2f47926f68826664cada3723c87006a", size = 658513, upload-time = "2026-03-23T11:56:01.239Z" },
+    { url = "https://files.pythonhosted.org/packages/20/b1/53591681b6ff2699a4f97b2d5552ba196eaa6a979b0873605f4c04b5f7ee/srsly-2.5.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c1ac27ae5f4bb9163c7d2c45fc8ec173aac3d92e32086d9472b326c5c6e570e", size = 1172265, upload-time = "2026-03-23T11:56:02.589Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/c9/741e29f534919a944a16da4184924b1d3404c4bf60716ab2b91be771d1e3/srsly-2.5.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:99026bcd9cbd3211cc36517400b04ca0fc5d3e412b14daf84ee6e65f67d9a2d8", size = 1180873, upload-time = "2026-03-23T11:56:03.944Z" },
+    { url = "https://files.pythonhosted.org/packages/89/57/5554f786eccf78b2750d6ac63be126e1b67badec2cb409dd611cf6f8c52b/srsly-2.5.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:07d682679e639eb46ff7e6da4a92714f4d5ffe351d088ee66f221e9b1f8865bb", size = 1120437, upload-time = "2026-03-23T11:56:05.283Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/95/9b4f73b1be3692f86d72ccc131c8e50f26f824d5c8830a59390bcc5b60ef/srsly-2.5.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8e0542d85d6b55cf2934050d6ffcb1cd76c768dcf9572e7467002cf087bb366d", size = 1137376, upload-time = "2026-03-23T11:56:06.613Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/de/89ca640ca1953c4612279ce515d0af35658df3c06cdb324329bc91b4a7e1/srsly-2.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:598f1e494c18cacb978299d77125415a586417081959f8ec3f068b32d97f8933", size = 652459, upload-time = "2026-03-23T11:56:07.994Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/4f/7ab6d49e36d9cc72ee15746cabd116eb6f338be8a06c1882968ee9d6c7d7/srsly-2.5.3-cp312-cp312-win_arm64.whl", hash = "sha256:4b1b721cd3ad1a9b2343519aadc786a4d09d5c0666962d49852eb12d6ec3fe26", size = 638411, upload-time = "2026-03-23T11:56:09.31Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/5c/12901e3794f4158abc6da750725aad6c2afddb1e4227b300fe7c71f66957/srsly-2.5.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e67b6bbacbfadea5e100266d2797f2d4cec9883ea4dc84a5537673850036a8d8", size = 656750, upload-time = "2026-03-23T11:56:10.708Z" },
+    { url = "https://files.pythonhosted.org/packages/04/61/181c26370995f96f56f1b64b801e3ca1e0d703fc36506ae28606d62369fb/srsly-2.5.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:348c231b4477d8fe86603131d0f166d2feac9c372704dfc4398be71cc5b6fb07", size = 656746, upload-time = "2026-03-23T11:56:12.28Z" },
+    { url = "https://files.pythonhosted.org/packages/77/c6/35876c78889f8ffe11ed3521644e666c3aef20ea31527b70f47456cf35c2/srsly-2.5.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b0938c2978c91ae1ef9c1f2ba35abb86330e198fb23469e356eba311e02233ee", size = 1155762, upload-time = "2026-03-23T11:56:14.075Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/da/40b71ca9906c8eb8f8feb6ac11d33dad458c85a56e1de764b96d402168a0/srsly-2.5.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f6a837954429ecbe6dcdd27390d2fb4c7d01a3f99c9ffcf9ce66b2a6dd1b738", size = 1161092, upload-time = "2026-03-23T11:56:15.778Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/14/c0dd30cc8b93ce8137ff4766f743c882440ce49195fffc5d50eaeef311a6/srsly-2.5.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3576c125c486ce2958c2047e8858fe3cfc9ea877adfa05203b0986f9badee355", size = 1109984, upload-time = "2026-03-23T11:56:17.056Z" },
+    { url = "https://files.pythonhosted.org/packages/08/f3/34354f183d8faafc631585571224b54d1b4b67e796972c36519c074ca355/srsly-2.5.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fb59c42922e095d1ea36085c55bc16e2adb06a7bfe57b24d381e0194ae699f2", size = 1128409, upload-time = "2026-03-23T11:56:18.761Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/d9/5531f8a19492060b4e76e4ab06aca6f096fb5128fe18cc813d1772daf653/srsly-2.5.3-cp313-cp313-win_amd64.whl", hash = "sha256:111805927f05f5db440aeeacb85ce43da0b19ce7b2a09567a9ef8d30f3cc4d83", size = 650820, upload-time = "2026-03-23T11:56:20.096Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/8a/62fb7a971eca29e12f03fb9ddacb058548c14d33e5b5675ff0f85839cc7b/srsly-2.5.3-cp313-cp313-win_arm64.whl", hash = "sha256:0f106b0a700ab56e4a7c431b0f1444009ab6cb332edc7bbf6811c2a43f4722cb", size = 637278, upload-time = "2026-03-23T11:56:21.439Z" },
+]
+
 [[package]]
 name = "sse-starlette"
 version = "3.4.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "anyio", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "starlette", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "anyio", marker = "python_full_version >= '3.11'" },
+    { name = "starlette", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/9a/f35932a8c0eb6b2287b66fa65a0321df8c84e4e355a659c1841a37c39fdb/sse_starlette-3.4.1.tar.gz", hash = "sha256:f780bebcf6c8997fe514e3bd8e8c648d8284976b391c8bed0bcb1f611632b555", size = 35127, upload-time = "2026-04-26T13:32:32.292Z" }
 wheels = [
@@ -7202,12 +6933,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" },
 ]
 
+[[package]]
+name = "structlog"
+version = "25.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ef/52/9ba0f43b686e7f3ddfeaa78ac3af750292662284b3661e91ad5494f21dbc/structlog-25.5.0.tar.gz", hash = "sha256:098522a3bebed9153d4570c6d0288abf80a031dfdb2048d59a49e9dc2190fc98", size = 1460830, upload-time = "2025-10-27T08:28:23.028Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a8/45/a132b9074aa18e799b891b91ad72133c98d8042c70f6240e4c5f9dabee2f/structlog-25.5.0-py3-none-any.whl", hash = "sha256:a8453e9b9e636ec59bd9e79bbd4a72f025981b3ba0f5837aebf48f02f37a7f9f", size = 72510, upload-time = "2025-10-27T08:28:21.535Z" },
+]
+
 [[package]]
 name = "sympy"
 version = "1.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mpmath", marker = "python_full_version >= '3.11'" },
+    { name = "mpmath" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
 wheels = [
@@ -7250,6 +6993,61 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/47/34/4f1bad936ac3ad94c8576b15660d4ce434f7dbd372baa53566a490bcdce3/textual-6.8.0-py3-none-any.whl", hash = "sha256:074d389ba8c6c98c74e2a4fe1493ea3a38f3ee5008697e98f71daa2cf8ab8fda", size = 714378, upload-time = "2025-12-07T17:53:44.501Z" },
 ]
 
+[[package]]
+name = "thinc"
+version = "8.3.13"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "blis", marker = "python_full_version >= '3.11'" },
+    { name = "catalogue", marker = "python_full_version >= '3.11'" },
+    { name = "confection", marker = "python_full_version >= '3.11'" },
+    { name = "cymem", marker = "python_full_version >= '3.11'" },
+    { name = "murmurhash", marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "packaging", marker = "python_full_version >= '3.11'" },
+    { name = "preshed", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "setuptools", version = "81.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "82.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version == '3.11.*' and platform_machine == 'x86_64' and sys_platform == 'linux') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
+    { name = "srsly", marker = "python_full_version >= '3.11'" },
+    { name = "wasabi", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/13/46/76df95f2c327f9a9cef30c1523bf285627897097163584dcf5f77b2ebce2/thinc-8.3.13.tar.gz", hash = "sha256:68e658549fc1eb3ff92aed5147fcbb9c15d6e9cc0e623b4d0998d16522ffb4f9", size = 194640, upload-time = "2026-03-23T07:22:36.41Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ad/e3/df570d55f38250d153e209d998f60e334026ea60cf9a887cffb85d7ee9bf/thinc-8.3.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84fb50fe572a1860165f2e7a640c7cb70d43d6962366e69f643fa9a27e4a2127", size = 846996, upload-time = "2026-03-23T07:21:32.701Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/72/e97c9cb863ef0a645ba069c24e0981bfaedf8241ba199512ebcd64ba090a/thinc-8.3.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3dac18a0fb0a42f711c2ce9c02cbb090385aecae92089aa17b9dfd808a542013", size = 815368, upload-time = "2026-03-23T07:21:34.392Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/7a/9283f52b1210dc052b795e22ec739d13929b914d1289e49336bede34c4eb/thinc-8.3.13-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e08b1577a56e7315770af280aabd8fa5f2a1fb6afd1c50a4183c06e907faf558", size = 3885033, upload-time = "2026-03-23T07:21:35.772Z" },
+    { url = "https://files.pythonhosted.org/packages/93/9a/aa8f2e19819c02781b282c3a9cfb57c76ff1fbe0b6deaa1ffd04dc920894/thinc-8.3.13-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:303477eb51b9b39c94a7fc7967ee8a039eca1ca37d95dcce1234c83b95b4ee9f", size = 3912947, upload-time = "2026-03-23T07:21:37.219Z" },
+    { url = "https://files.pythonhosted.org/packages/51/fa/ea7c67667b8a875178bea5a42dc9c8b0622c34e7eba3d8e42874f2c4b4c1/thinc-8.3.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d7a9654f9ca362a4be7f5e590fdfee26e2e2084da9fd3306032ec037e99f2f8e", size = 4887518, upload-time = "2026-03-23T07:21:38.76Z" },
+    { url = "https://files.pythonhosted.org/packages/36/44/99c391e951e3b706b9a7552ced720e9ec3bddd6707a99d53e4354ebefa45/thinc-8.3.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e1f8d13bf92ee10595c40692fd4cf8e7bbe73bd9f260107e975fd5dbee1af42b", size = 5044691, upload-time = "2026-03-23T07:21:40.257Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/cf/9d95fb5f12d76ad1c7570a9a38da2f2f60dba721c87630bfabaabef91bc3/thinc-8.3.13-cp310-cp310-win_amd64.whl", hash = "sha256:e7f046d8914055cad51e83ff0da1a892acb73cd58556d7c1a5d4015a3766a899", size = 1795372, upload-time = "2026-03-23T07:21:41.709Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/72/ca06842a007e8c794e8c59462f242cdfd6167d7cc9d0155ad004b194b015/thinc-8.3.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4565102638038a01a2193c7f5d41ccbd6233fbdcb1f1b184322a06add4f51f18", size = 844359, upload-time = "2026-03-23T07:21:43.017Z" },
+    { url = "https://files.pythonhosted.org/packages/48/44/e6aef092f478d263f72eb3933b55a6f37ba97c6a0ea0a61d13fbf9bf0c19/thinc-8.3.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:859fbd9d9b16af5278da23589b4afbe2ab6b0dd615df4d3229b7c4e67cd3107e", size = 812089, upload-time = "2026-03-23T07:21:44.618Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/8a/9ce0424d456cd3580cc3a855b23a7ff86b81d5299fceb496a2f56f06c1c0/thinc-8.3.13-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a518d5c761a0f2341e530e867de133dc3ed814558365b2a68ec53b89c482a43f", size = 4101388, upload-time = "2026-03-23T07:21:46.135Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/51/ec91c0434bd9a1096ab874bbd6dc110c5089d7fc513137e6af59bd051eec/thinc-8.3.13-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:81337dfbee37f58f36c0c70f9a819dce1b32cdc13d959181e10de079621f6ac6", size = 4131972, upload-time = "2026-03-23T07:21:48.403Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/67/e30dea753c90cff5cb9e5feb34948fdb89a6774b84d849585b49e16a730e/thinc-8.3.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fbc0ee16edd260c6a4a9e365ff36d0a682c9e7ca6d7b985682659ef2e3e73826", size = 5101283, upload-time = "2026-03-23T07:21:49.991Z" },
+    { url = "https://files.pythonhosted.org/packages/00/e9/b7544eddababa16e548b26a96fff29eeb307ce938df5fa4af9371fe8ed5d/thinc-8.3.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0355c37e40d1a9fc2a1b8e9c2e294d8586f6baa97bcac6b9002f2dddb4b82ae9", size = 5264488, upload-time = "2026-03-23T07:21:51.747Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/a9/49391a40d703efc0f7a451310373261835f71fd3e6e2e8cfc08ee02f78ad/thinc-8.3.13-cp311-cp311-win_amd64.whl", hash = "sha256:0a0fa13dcfe4b319c3a396432c1dbff30d3de37dbbdee559e76600ee2b9486df", size = 1795058, upload-time = "2026-03-23T07:21:53.424Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/31/fd5348d44beda12a3ee415cbba9ed4fd0b17ce65db1d473c38a29a8d6153/thinc-8.3.13-cp311-cp311-win_arm64.whl", hash = "sha256:cd8a2b714c061969eee65802965167a6ada1fe708d82fe176d98dcb95ebe182a", size = 1721215, upload-time = "2026-03-23T07:21:55.027Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/af/f7c1ebfe92eb5d27d7f2f3da67a11e2eb57bc30ab1553279af6dc65b65a8/thinc-8.3.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:77a41f66285321d20aaedaea1e87d7cd48dca6d2427bed1867ec7cba7109fc8d", size = 821097, upload-time = "2026-03-23T07:21:56.698Z" },
+    { url = "https://files.pythonhosted.org/packages/45/8f/69d7338575d98df85d0b54c0f5fc277dba72587fe9ab846ecdd12a998bcb/thinc-8.3.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3710d318b4e5460cf366a6f7b5ddbefb5d39dbd4cfa408222750fdc6c27c4411", size = 791932, upload-time = "2026-03-23T07:21:58.38Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/a5/21d010c81e81e1589e5ccb4950e521804d13726e541e87f644c51815673b/thinc-8.3.13-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5a08c87143a6d20177652dca1ec0dc815d88216d8fc62594a57e8bc45bf5ed49", size = 3854219, upload-time = "2026-03-23T07:21:59.819Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/ff/6914bf370bd1d604d89e6dfb46b97d10cd9b00d42ff8c036283e92314a8c/thinc-8.3.13-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4b5ec9ff313819e7d8667794a3559463fa89ff45aaa73e3fd8d6273b1e0d7a7f", size = 3903307, upload-time = "2026-03-23T07:22:01.652Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/3d/5572b47fa155fb3388c071515b74024fa17a6efd1df9406da378f0aa84ef/thinc-8.3.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5c9a48f2bc1e04f138240ed5f9b815a9141a5de26accd0f08fa0137fcefed258", size = 4836882, upload-time = "2026-03-23T07:22:03.565Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/f0/a8d77c7bac089697c6df302cc3c936a1ab36a4720deae889e6f1dbcbd0eb/thinc-8.3.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:79a29a44d76bd02f5ac0624268c6e42b3576ae472c791a8ae9c2d813ae789b59", size = 5033398, upload-time = "2026-03-23T07:22:05.045Z" },
+    { url = "https://files.pythonhosted.org/packages/21/82/5651bb1f904d04220fc7670035ada921bf0638e2cff6444d67c12887a968/thinc-8.3.13-cp312-cp312-win_amd64.whl", hash = "sha256:ed1dc709ac4f2f03b710457889e4e02f05de51bc8456980c241d0b28798bc7cb", size = 1721248, upload-time = "2026-03-23T07:22:06.749Z" },
+    { url = "https://files.pythonhosted.org/packages/94/8d/683703de021ffbe46833d722b70f49ffbbca8e5bd6876256977555d92d7d/thinc-8.3.13-cp312-cp312-win_arm64.whl", hash = "sha256:c6a049703a6011c8fe26ee41af7e70272145594140d82f79bb23de619c6a6525", size = 1645777, upload-time = "2026-03-23T07:22:08.104Z" },
+    { url = "https://files.pythonhosted.org/packages/af/b9/7b46942176df459d1804a9e77b0976f7c56f3abf3ec7485d0e5f836a0382/thinc-8.3.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c2811dfd8d46d8b5d3b39051b23e64006b2994a5143b1978b436938018792af8", size = 817337, upload-time = "2026-03-23T07:22:09.538Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/79/53085a72cd8f4fc4e6e313d05ea5aa98e870684f4a0fb318a9875fc0a964/thinc-8.3.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5593e6300cb1ebe0c0e546e9c9fb49e7c2627a0aa688795cd4f995a8b820d2ec", size = 788120, upload-time = "2026-03-23T07:22:11.215Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/3e/d61b462b16da95ac6885f95bb395e672040ee594833e571a6edcffd234f5/thinc-8.3.13-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f697174d3fb474966ce50b430bbafa101a6d2f7ffb559dac4b5c59389ef72d22", size = 3844666, upload-time = "2026-03-23T07:22:12.67Z" },
+    { url = "https://files.pythonhosted.org/packages/78/4c/898cc654bb123734c71ec5a425c02ca34439517d01ce1c95a6563295580e/thinc-8.3.13-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9c7c5c104737b414c8c4ec578e67d78b6c859afe25cbc0684402e721415bd7f", size = 3890658, upload-time = "2026-03-23T07:22:14.668Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/56/1abdbf0a4ad628e8a05d6516fe0745969649d805367a3dccad8ee872981b/thinc-8.3.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7a99d0e242d1ccd23f9ae6bea7cd502f8626efa65c156b91d84581d0356696c3", size = 4819933, upload-time = "2026-03-23T07:22:16.85Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/22/b84dbdc6be5055bbdb2a7352e2c393f67e8593c137f1b83c82bf1e062b6e/thinc-8.3.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e676edd21a747afbe3e6b9f3fca8b962e36d146ded03b070cb0c28e2dfbe9499", size = 5018099, upload-time = "2026-03-23T07:22:18.356Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/a8/763cd7ba949334c9d2cddc92dadb68b344cb9546dc01b8d4a733dcaa16c1/thinc-8.3.13-cp313-cp313-win_amd64.whl", hash = "sha256:8ad40307f20e83f77af28ff5c6be0b86af7a8b251d1231c545508d2763157d8f", size = 1720309, upload-time = "2026-03-23T07:22:19.81Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/15/a11f7bb3cbc97dfecf32a90552f5a8f8a5c99316a99c6c17bdabf5baf256/thinc-8.3.13-cp313-cp313-win_arm64.whl", hash = "sha256:723949cab11d1925c15447928513a718276316cec6e0de28337cca0a62be0521", size = 1644606, upload-time = "2026-03-23T07:22:21.339Z" },
+]
+
 [[package]]
 name = "threadpoolctl"
 version = "3.6.0"
@@ -7264,8 +7062,8 @@ name = "tiktoken"
 version = "0.12.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "regex", marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
-    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
+    { name = "regex", marker = "python_full_version >= '3.11'" },
+    { name = "requests", version = "2.33.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
 wheels = [
@@ -7304,20 +7102,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" },
     { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" },
     { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" },
-    { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" },
-    { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" },
-    { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" },
-    { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" },
-    { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" },
-    { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" },
 ]
 
 [[package]]
@@ -7384,22 +7168,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/55/92/afed3d497f7c186dc71e6ee6d4fcb0acfa5f7d0a1a2878f8beae379ae0cc/tomli-2.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ad805ea85eda330dbad64c7ea7a4556259665bdf9d2672f5dccc740eb9d3ca05", size = 248909, upload-time = "2025-10-08T22:01:23.859Z" },
     { url = "https://files.pythonhosted.org/packages/f8/84/ef50c51b5a9472e7265ce1ffc7f24cd4023d289e109f669bdb1553f6a7c2/tomli-2.3.0-cp313-cp313-win32.whl", hash = "sha256:97d5eec30149fd3294270e889b4234023f2c69747e555a27bd708828353ab606", size = 96946, upload-time = "2025-10-08T22:01:24.893Z" },
     { url = "https://files.pythonhosted.org/packages/b2/b7/718cd1da0884f281f95ccfa3a6cc572d30053cba64603f79d431d3c9b61b/tomli-2.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0c95ca56fbe89e065c6ead5b593ee64b84a26fca063b5d71a1122bf26e533999", size = 107705, upload-time = "2025-10-08T22:01:26.153Z" },
-    { url = "https://files.pythonhosted.org/packages/19/94/aeafa14a52e16163008060506fcb6aa1949d13548d13752171a755c65611/tomli-2.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:cebc6fe843e0733ee827a282aca4999b596241195f43b4cc371d64fc6639da9e", size = 154244, upload-time = "2025-10-08T22:01:27.06Z" },
-    { url = "https://files.pythonhosted.org/packages/db/e4/1e58409aa78eefa47ccd19779fc6f36787edbe7d4cd330eeeedb33a4515b/tomli-2.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4c2ef0244c75aba9355561272009d934953817c49f47d768070c3c94355c2aa3", size = 148637, upload-time = "2025-10-08T22:01:28.059Z" },
-    { url = "https://files.pythonhosted.org/packages/26/b6/d1eccb62f665e44359226811064596dd6a366ea1f985839c566cd61525ae/tomli-2.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c22a8bf253bacc0cf11f35ad9808b6cb75ada2631c2d97c971122583b129afbc", size = 241925, upload-time = "2025-10-08T22:01:29.066Z" },
-    { url = "https://files.pythonhosted.org/packages/70/91/7cdab9a03e6d3d2bb11beae108da5bdc1c34bdeb06e21163482544ddcc90/tomli-2.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0eea8cc5c5e9f89c9b90c4896a8deefc74f518db5927d0e0e8d4a80953d774d0", size = 249045, upload-time = "2025-10-08T22:01:31.98Z" },
-    { url = "https://files.pythonhosted.org/packages/15/1b/8c26874ed1f6e4f1fcfeb868db8a794cbe9f227299402db58cfcc858766c/tomli-2.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b74a0e59ec5d15127acdabd75ea17726ac4c5178ae51b85bfe39c4f8a278e879", size = 245835, upload-time = "2025-10-08T22:01:32.989Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/42/8e3c6a9a4b1a1360c1a2a39f0b972cef2cc9ebd56025168c4137192a9321/tomli-2.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b5870b50c9db823c595983571d1296a6ff3e1b88f734a4c8f6fc6188397de005", size = 253109, upload-time = "2025-10-08T22:01:34.052Z" },
-    { url = "https://files.pythonhosted.org/packages/22/0c/b4da635000a71b5f80130937eeac12e686eefb376b8dee113b4a582bba42/tomli-2.3.0-cp314-cp314-win32.whl", hash = "sha256:feb0dacc61170ed7ab602d3d972a58f14ee3ee60494292d384649a3dc38ef463", size = 97930, upload-time = "2025-10-08T22:01:35.082Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/74/cb1abc870a418ae99cd5c9547d6bce30701a954e0e721821df483ef7223c/tomli-2.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:b273fcbd7fc64dc3600c098e39136522650c49bca95df2d11cf3b626422392c8", size = 107964, upload-time = "2025-10-08T22:01:36.057Z" },
-    { url = "https://files.pythonhosted.org/packages/54/78/5c46fff6432a712af9f792944f4fcd7067d8823157949f4e40c56b8b3c83/tomli-2.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:940d56ee0410fa17ee1f12b817b37a4d4e4dc4d27340863cc67236c74f582e77", size = 163065, upload-time = "2025-10-08T22:01:37.27Z" },
-    { url = "https://files.pythonhosted.org/packages/39/67/f85d9bd23182f45eca8939cd2bc7050e1f90c41f4a2ecbbd5963a1d1c486/tomli-2.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f85209946d1fe94416debbb88d00eb92ce9cd5266775424ff81bc959e001acaf", size = 159088, upload-time = "2025-10-08T22:01:38.235Z" },
-    { url = "https://files.pythonhosted.org/packages/26/5a/4b546a0405b9cc0659b399f12b6adb750757baf04250b148d3c5059fc4eb/tomli-2.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a56212bdcce682e56b0aaf79e869ba5d15a6163f88d5451cbde388d48b13f530", size = 268193, upload-time = "2025-10-08T22:01:39.712Z" },
-    { url = "https://files.pythonhosted.org/packages/42/4f/2c12a72ae22cf7b59a7fe75b3465b7aba40ea9145d026ba41cb382075b0e/tomli-2.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5f3ffd1e098dfc032d4d3af5c0ac64f6d286d98bc148698356847b80fa4de1b", size = 275488, upload-time = "2025-10-08T22:01:40.773Z" },
-    { url = "https://files.pythonhosted.org/packages/92/04/a038d65dbe160c3aa5a624e93ad98111090f6804027d474ba9c37c8ae186/tomli-2.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5e01decd096b1530d97d5d85cb4dff4af2d8347bd35686654a004f8dea20fc67", size = 272669, upload-time = "2025-10-08T22:01:41.824Z" },
-    { url = "https://files.pythonhosted.org/packages/be/2f/8b7c60a9d1612a7cbc39ffcca4f21a73bf368a80fc25bccf8253e2563267/tomli-2.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8a35dd0e643bb2610f156cca8db95d213a90015c11fee76c946aa62b7ae7e02f", size = 279709, upload-time = "2025-10-08T22:01:43.177Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/46/cc36c679f09f27ded940281c38607716c86cf8ba4a518d524e349c8b4874/tomli-2.3.0-cp314-cp314t-win32.whl", hash = "sha256:a1f7f282fe248311650081faafa5f4732bdbfef5d45fe3f2e702fbc6f2d496e0", size = 107563, upload-time = "2025-10-08T22:01:44.233Z" },
-    { url = "https://files.pythonhosted.org/packages/84/ff/426ca8683cf7b753614480484f6437f568fd2fda2edbdf57a2d3d8b27a0b/tomli-2.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:70a251f8d4ba2d9ac2542eecf008b3c8a9fc5c3f9f02c56a9d7952612be2fdba", size = 119756, upload-time = "2025-10-08T22:01:45.234Z" },
     { url = "https://files.pythonhosted.org/packages/77/b8/0135fadc89e73be292b473cb820b4f5a08197779206b33191e801feeae40/tomli-2.3.0-py3-none-any.whl", hash = "sha256:e95b1af3c5b07d9e643909b5abbec77cd9f1217e6d0bca72b0234736b9fb1f1b", size = 14408, upload-time = "2025-10-08T22:01:46.04Z" },
 ]
 
@@ -7426,31 +7194,33 @@ name = "torch"
 version = "2.10.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cuda-bindings", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "filelock", marker = "python_full_version >= '3.11'" },
+    { name = "cuda-bindings", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "filelock" },
     { name = "fsspec", version = "2025.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "jinja2", marker = "python_full_version >= '3.11'" },
+    { name = "fsspec", version = "2025.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "jinja2" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
-    { name = "nvidia-cublas-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufft-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufile-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-curand-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparselt-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nccl-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvshmem-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "setuptools", version = "81.0.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.12' and python_full_version < '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux') or (python_full_version >= '3.12' and platform_machine == 'aarch64' and sys_platform == 'linux')" },
-    { name = "setuptools", version = "82.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.12' and platform_machine != 'aarch64' and platform_machine != 'x86_64') or (python_full_version >= '3.14' and platform_machine == 'x86_64' and sys_platform == 'linux') or (python_full_version >= '3.12' and sys_platform != 'linux')" },
-    { name = "sympy", marker = "python_full_version >= '3.11'" },
-    { name = "triton", marker = "python_full_version >= '3.11' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "81.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", version = "82.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.12' and platform_machine != 'x86_64') or (python_full_version >= '3.12' and sys_platform != 'linux')" },
+    { name = "sympy" },
+    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "typing-extensions" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5b/30/bfebdd8ec77db9a79775121789992d6b3b75ee5494971294d7b4b7c999bc/torch-2.10.0-2-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2b980edd8d7c0a68c4e951ee1856334a43193f98730d97408fbd148c1a933313", size = 79411457, upload-time = "2026-02-10T21:44:59.189Z" },
@@ -7462,8 +7232,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" },
     { url = "https://files.pythonhosted.org/packages/ab/c6/4dfe238342ffdcec5aef1c96c457548762d33c40b45a1ab7033bb26d2ff2/torch-2.10.0-3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:80b1b5bfe38eb0e9f5ff09f206dcac0a87aadd084230d4a36eea5ec5232c115b", size = 915627275, upload-time = "2026-03-11T14:16:11.325Z" },
     { url = "https://files.pythonhosted.org/packages/d8/f0/72bf18847f58f877a6a8acf60614b14935e2f156d942483af1ffc081aea0/torch-2.10.0-3-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:46b3574d93a2a8134b3f5475cfb98e2eb46771794c57015f6ad1fb795ec25e49", size = 915523474, upload-time = "2026-03-11T14:17:44.422Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/39/590742415c3030551944edc2ddc273ea1fdfe8ffb2780992e824f1ebee98/torch-2.10.0-3-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b1d5e2aba4eb7f8e87fbe04f86442887f9167a35f092afe4c237dfcaaef6e328", size = 915632474, upload-time = "2026-03-11T14:15:13.666Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/8e/34949484f764dde5b222b7fe3fede43e4a6f0da9d7f8c370bb617d629ee2/torch-2.10.0-3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0228d20b06701c05a8f978357f657817a4a63984b0c90745def81c18aedfa591", size = 915523882, upload-time = "2026-03-11T14:14:46.311Z" },
     { url = "https://files.pythonhosted.org/packages/0c/1a/c61f36cfd446170ec27b3a4984f072fd06dab6b5d7ce27e11adb35d6c838/torch-2.10.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:5276fa790a666ee8becaffff8acb711922252521b28fbce5db7db5cf9cb2026d", size = 145992962, upload-time = "2026-01-21T16:24:14.04Z" },
     { url = "https://files.pythonhosted.org/packages/b5/60/6662535354191e2d1555296045b63e4279e5a9dbad49acf55a5d38655a39/torch-2.10.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aaf663927bcd490ae971469a624c322202a2a1e68936eb952535ca4cd3b90444", size = 915599237, upload-time = "2026-01-21T16:23:25.497Z" },
     { url = "https://files.pythonhosted.org/packages/40/b8/66bbe96f0d79be2b5c697b2e0b187ed792a15c6c4b8904613454651db848/torch-2.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:a4be6a2a190b32ff5c8002a0977a25ea60e64f7ba46b1be37093c141d9c49aeb", size = 113720931, upload-time = "2026-01-21T16:24:23.743Z" },
@@ -7484,14 +7252,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/54/fd/b207d1c525cb570ef47f3e9f836b154685011fce11a2f444ba8a4084d042/torch-2.10.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6021db85958db2f07ec94e1bc77212721ba4920c12a18dc552d2ae36a3eb163f", size = 915612644, upload-time = "2026-01-21T16:21:47.019Z" },
     { url = "https://files.pythonhosted.org/packages/36/53/0197f868c75f1050b199fe58f9bf3bf3aecac9b4e85cc9c964383d745403/torch-2.10.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff43db38af76fda183156153983c9a096fc4c78d0cd1e07b14a2314c7f01c2c8", size = 113997015, upload-time = "2026-01-21T16:23:00.767Z" },
     { url = "https://files.pythonhosted.org/packages/0e/13/e76b4d9c160e89fff48bf16b449ea324bda84745d2ab30294c37c2434c0d/torch-2.10.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:cdf2a523d699b70d613243211ecaac14fe9c5df8a0b0a9c02add60fb2a413e0f", size = 79498248, upload-time = "2026-01-21T16:23:09.315Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/93/716b5ac0155f1be70ed81bacc21269c3ece8dba0c249b9994094110bfc51/torch-2.10.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:bf0d9ff448b0218e0433aeb198805192346c4fd659c852370d5cc245f602a06a", size = 79464992, upload-time = "2026-01-21T16:23:05.162Z" },
-    { url = "https://files.pythonhosted.org/packages/69/2b/51e663ff190c9d16d4a8271203b71bc73a16aa7619b9f271a69b9d4a936b/torch-2.10.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:233aed0659a2503b831d8a67e9da66a62c996204c0bba4f4c442ccc0c68a3f60", size = 146018567, upload-time = "2026-01-21T16:22:23.393Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/cd/4b95ef7f293b927c283db0b136c42be91c8ec6845c44de0238c8c23bdc80/torch-2.10.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:682497e16bdfa6efeec8cde66531bc8d1fbbbb4d8788ec6173c089ed3cc2bfe5", size = 915721646, upload-time = "2026-01-21T16:21:16.983Z" },
-    { url = "https://files.pythonhosted.org/packages/56/97/078a007208f8056d88ae43198833469e61a0a355abc0b070edd2c085eb9a/torch-2.10.0-cp314-cp314-win_amd64.whl", hash = "sha256:6528f13d2a8593a1a412ea07a99812495bec07e9224c28b2a25c0a30c7da025c", size = 113752373, upload-time = "2026-01-21T16:22:13.471Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/94/71994e7d0d5238393df9732fdab607e37e2b56d26a746cb59fdb415f8966/torch-2.10.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:f5ab4ba32383061be0fb74bda772d470140a12c1c3b58a0cfbf3dae94d164c28", size = 79850324, upload-time = "2026-01-21T16:22:09.494Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/65/1a05346b418ea8ccd10360eef4b3e0ce688fba544e76edec26913a8d0ee0/torch-2.10.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:716b01a176c2a5659c98f6b01bf868244abdd896526f1c692712ab36dbaf9b63", size = 146006482, upload-time = "2026-01-21T16:22:18.42Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/b9/5f6f9d9e859fc3235f60578fa64f52c9c6e9b4327f0fe0defb6de5c0de31/torch-2.10.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:d8f5912ba938233f86361e891789595ff35ca4b4e2ac8fe3670895e5976731d6", size = 915613050, upload-time = "2026-01-21T16:20:49.035Z" },
-    { url = "https://files.pythonhosted.org/packages/66/4d/35352043ee0eaffdeff154fad67cd4a31dbed7ff8e3be1cc4549717d6d51/torch-2.10.0-cp314-cp314t-win_amd64.whl", hash = "sha256:71283a373f0ee2c89e0f0d5f446039bdabe8dbc3c9ccf35f0f784908b0acd185", size = 113995816, upload-time = "2026-01-21T16:22:05.312Z" },
 ]
 
 [[package]]
@@ -7539,13 +7299,13 @@ wheels = [
 
 [[package]]
 name = "transformers"
-version = "4.57.3"
+version = "4.57.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "huggingface-hub" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
@@ -7555,9 +7315,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" },
+    { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" },
 ]
 
 [[package]]
@@ -7584,8 +7344,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" },
     { url = "https://files.pythonhosted.org/packages/f9/0b/37d991d8c130ce81a8728ae3c25b6e60935838e9be1b58791f5997b24a54/triton-3.6.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c7f76c6e72d2ef08df639e3d0d30729112f47a56b0c81672edc05ee5116ac9", size = 188289450, upload-time = "2026-01-20T16:00:49.136Z" },
     { url = "https://files.pythonhosted.org/packages/35/f8/9c66bfc55361ec6d0e4040a0337fb5924ceb23de4648b8a81ae9d33b2b38/triton-3.6.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d002e07d7180fd65e622134fbd980c9a3d4211fb85224b56a0a0efbd422ab72f", size = 188400296, upload-time = "2026-01-20T16:00:56.042Z" },
-    { url = "https://files.pythonhosted.org/packages/df/3d/9e7eee57b37c80cec63322c0231bb6da3cfe535a91d7a4d64896fcb89357/triton-3.6.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a17a5d5985f0ac494ed8a8e54568f092f7057ef60e1b0fa09d3fd1512064e803", size = 188273063, upload-time = "2026-01-20T16:01:07.278Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/56/6113c23ff46c00aae423333eb58b3e60bdfe9179d542781955a5e1514cb3/triton-3.6.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46bd1c1af4b6704e554cad2eeb3b0a6513a980d470ccfa63189737340c7746a7", size = 188397994, upload-time = "2026-01-20T16:01:14.236Z" },
 ]
 
 [[package]]
@@ -7675,11 +7433,11 @@ wheels = [
 
 [[package]]
 name = "urllib3"
-version = "1.26.20"
+version = "2.7.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" },
 ]
 
 [[package]]
@@ -7711,6 +7469,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/0c/c05523fa3181fdf0c9c52a6ba91a23fbf3246cc095f26f6516f9c60e6771/virtualenv-20.35.4-py3-none-any.whl", hash = "sha256:c21c9cede36c9753eeade68ba7d523529f228a403463376cf821eaae2b650f1b", size = 6005095, upload-time = "2025-10-29T06:57:37.598Z" },
 ]
 
+[[package]]
+name = "waitress"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901, upload-time = "2024-11-16T20:02:35.195Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232, upload-time = "2024-11-16T20:02:33.858Z" },
+]
+
 [[package]]
 name = "wandb"
 version = "0.23.1"
@@ -7741,6 +7508,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/30/20/6c091d451e2a07689bfbfaeb7592d488011420e721de170884fedd68c644/wandb-0.23.1-py3-none-win_arm64.whl", hash = "sha256:8aee7f3bb573f2c0acf860f497ca9c684f9b35f2ca51011ba65af3d4592b77c1", size = 20137463, upload-time = "2025-12-03T02:25:08.317Z" },
 ]
 
+[[package]]
+name = "wasabi"
+version = "1.1.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "python_full_version >= '3.11' and sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ac/f9/054e6e2f1071e963b5e746b48d1e3727470b2a490834d18ad92364929db3/wasabi-1.1.3.tar.gz", hash = "sha256:4bb3008f003809db0c3e28b4daf20906ea871a2bb43f9914197d540f4f2e0878", size = 30391, upload-time = "2024-05-31T16:56:18.99Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/7c/34330a89da55610daa5f245ddce5aab81244321101614751e7537f125133/wasabi-1.1.3-py3-none-any.whl", hash = "sha256:f76e16e8f7e79f8c4c8be49b4024ac725713ab10cd7f19350ad18a8e3f71728c", size = 27880, upload-time = "2024-05-31T16:56:16.699Z" },
+]
+
 [[package]]
 name = "watchfiles"
 version = "1.1.1"
@@ -7811,29 +7590,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/f7/0a4467be0a56e80447c8529c9fce5b38eab4f513cb3d9bf82e7392a5696b/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7eb7da0eb23aa2ba036d4f616d46906013a68caf61b7fdbe42fc8b25132e77", size = 455425, upload-time = "2025-10-14T15:05:23.348Z" },
     { url = "https://files.pythonhosted.org/packages/8e/e0/82583485ea00137ddf69bc84a2db88bd92ab4a6e3c405e5fb878ead8d0e7/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:831a62658609f0e5c64178211c942ace999517f5770fe9436be4c2faeba0c0ef", size = 628826, upload-time = "2025-10-14T15:05:24.398Z" },
     { url = "https://files.pythonhosted.org/packages/28/9a/a785356fccf9fae84c0cc90570f11702ae9571036fb25932f1242c82191c/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f9a2ae5c91cecc9edd47e041a930490c31c3afb1f5e6d71de3dc671bfaca02bf", size = 622208, upload-time = "2025-10-14T15:05:25.45Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/f4/0872229324ef69b2c3edec35e84bd57a1289e7d3fe74588048ed8947a323/watchfiles-1.1.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:d1715143123baeeaeadec0528bb7441103979a1d5f6fd0e1f915383fea7ea6d5", size = 404315, upload-time = "2025-10-14T15:05:26.501Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/22/16d5331eaed1cb107b873f6ae1b69e9ced582fcf0c59a50cd84f403b1c32/watchfiles-1.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:39574d6370c4579d7f5d0ad940ce5b20db0e4117444e39b6d8f99db5676c52fd", size = 390869, upload-time = "2025-10-14T15:05:27.649Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/7e/5643bfff5acb6539b18483128fdc0ef2cccc94a5b8fbda130c823e8ed636/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7365b92c2e69ee952902e8f70f3ba6360d0d596d9299d55d7d386df84b6941fb", size = 449919, upload-time = "2025-10-14T15:05:28.701Z" },
-    { url = "https://files.pythonhosted.org/packages/51/2e/c410993ba5025a9f9357c376f48976ef0e1b1aefb73b97a5ae01a5972755/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bfff9740c69c0e4ed32416f013f3c45e2ae42ccedd1167ef2d805c000b6c71a5", size = 460845, upload-time = "2025-10-14T15:05:30.064Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/a4/2df3b404469122e8680f0fcd06079317e48db58a2da2950fb45020947734/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b27cf2eb1dda37b2089e3907d8ea92922b673c0c427886d4edc6b94d8dfe5db3", size = 489027, upload-time = "2025-10-14T15:05:31.064Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/84/4587ba5b1f267167ee715b7f66e6382cca6938e0a4b870adad93e44747e6/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:526e86aced14a65a5b0ec50827c745597c782ff46b571dbfe46192ab9e0b3c33", size = 595615, upload-time = "2025-10-14T15:05:32.074Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/0f/c6988c91d06e93cd0bb3d4a808bcf32375ca1904609835c3031799e3ecae/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04e78dd0b6352db95507fd8cb46f39d185cf8c74e4cf1e4fbad1d3df96faf510", size = 474836, upload-time = "2025-10-14T15:05:33.209Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/36/ded8aebea91919485b7bbabbd14f5f359326cb5ec218cd67074d1e426d74/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c85794a4cfa094714fb9c08d4a218375b2b95b8ed1666e8677c349906246c05", size = 455099, upload-time = "2025-10-14T15:05:34.189Z" },
-    { url = "https://files.pythonhosted.org/packages/98/e0/8c9bdba88af756a2fce230dd365fab2baf927ba42cd47521ee7498fd5211/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:74d5012b7630714b66be7b7b7a78855ef7ad58e8650c73afc4c076a1f480a8d6", size = 630626, upload-time = "2025-10-14T15:05:35.216Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/84/a95db05354bf2d19e438520d92a8ca475e578c647f78f53197f5a2f17aaf/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:8fbe85cb3201c7d380d3d0b90e63d520f15d6afe217165d7f98c9c649654db81", size = 622519, upload-time = "2025-10-14T15:05:36.259Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/ce/d8acdc8de545de995c339be67711e474c77d643555a9bb74a9334252bd55/watchfiles-1.1.1-cp314-cp314-win32.whl", hash = "sha256:3fa0b59c92278b5a7800d3ee7733da9d096d4aabcfabb9a928918bd276ef9b9b", size = 272078, upload-time = "2025-10-14T15:05:37.63Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/c9/a74487f72d0451524be827e8edec251da0cc1fcf111646a511ae752e1a3d/watchfiles-1.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:c2047d0b6cea13b3316bdbafbfa0c4228ae593d995030fda39089d36e64fc03a", size = 287664, upload-time = "2025-10-14T15:05:38.95Z" },
-    { url = "https://files.pythonhosted.org/packages/df/b8/8ac000702cdd496cdce998c6f4ee0ca1f15977bba51bdf07d872ebdfc34c/watchfiles-1.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:842178b126593addc05acf6fce960d28bc5fae7afbaa2c6c1b3a7b9460e5be02", size = 277154, upload-time = "2025-10-14T15:05:39.954Z" },
-    { url = "https://files.pythonhosted.org/packages/47/a8/e3af2184707c29f0f14b1963c0aace6529f9d1b8582d5b99f31bbf42f59e/watchfiles-1.1.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:88863fbbc1a7312972f1c511f202eb30866370ebb8493aef2812b9ff28156a21", size = 403820, upload-time = "2025-10-14T15:05:40.932Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/ec/e47e307c2f4bd75f9f9e8afbe3876679b18e1bcec449beca132a1c5ffb2d/watchfiles-1.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:55c7475190662e202c08c6c0f4d9e345a29367438cf8e8037f3155e10a88d5a5", size = 390510, upload-time = "2025-10-14T15:05:41.945Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/a0/ad235642118090f66e7b2f18fd5c42082418404a79205cdfca50b6309c13/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f53fa183d53a1d7a8852277c92b967ae99c2d4dcee2bfacff8868e6e30b15f7", size = 448408, upload-time = "2025-10-14T15:05:43.385Z" },
-    { url = "https://files.pythonhosted.org/packages/df/85/97fa10fd5ff3332ae17e7e40e20784e419e28521549780869f1413742e9d/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6aae418a8b323732fa89721d86f39ec8f092fc2af67f4217a2b07fd3e93c6101", size = 458968, upload-time = "2025-10-14T15:05:44.404Z" },
-    { url = "https://files.pythonhosted.org/packages/47/c2/9059c2e8966ea5ce678166617a7f75ecba6164375f3b288e50a40dc6d489/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f096076119da54a6080e8920cbdaac3dbee667eb91dcc5e5b78840b87415bd44", size = 488096, upload-time = "2025-10-14T15:05:45.398Z" },
-    { url = "https://files.pythonhosted.org/packages/94/44/d90a9ec8ac309bc26db808a13e7bfc0e4e78b6fc051078a554e132e80160/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00485f441d183717038ed2e887a7c868154f216877653121068107b227a2f64c", size = 596040, upload-time = "2025-10-14T15:05:46.502Z" },
-    { url = "https://files.pythonhosted.org/packages/95/68/4e3479b20ca305cfc561db3ed207a8a1c745ee32bf24f2026a129d0ddb6e/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a55f3e9e493158d7bfdb60a1165035f1cf7d320914e7b7ea83fe22c6023b58fc", size = 473847, upload-time = "2025-10-14T15:05:47.484Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/55/2af26693fd15165c4ff7857e38330e1b61ab8c37d15dc79118cdba115b7a/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c91ed27800188c2ae96d16e3149f199d62f86c7af5f5f4d2c61a3ed8cd3666c", size = 455072, upload-time = "2025-10-14T15:05:48.928Z" },
-    { url = "https://files.pythonhosted.org/packages/66/1d/d0d200b10c9311ec25d2273f8aad8c3ef7cc7ea11808022501811208a750/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:311ff15a0bae3714ffb603e6ba6dbfba4065ab60865d15a6ec544133bdb21099", size = 629104, upload-time = "2025-10-14T15:05:49.908Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/bd/fa9bb053192491b3867ba07d2343d9f2252e00811567d30ae8d0f78136fe/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:a916a2932da8f8ab582f242c065f5c81bed3462849ca79ee357dd9551b0e9b01", size = 622112, upload-time = "2025-10-14T15:05:50.941Z" },
     { url = "https://files.pythonhosted.org/packages/ba/4c/a888c91e2e326872fa4705095d64acd8aa2fb9c1f7b9bd0588f33850516c/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:17ef139237dfced9da49fb7f2232c86ca9421f666d78c264c7ffca6601d154c3", size = 409611, upload-time = "2025-10-14T15:06:05.809Z" },
     { url = "https://files.pythonhosted.org/packages/1e/c7/5420d1943c8e3ce1a21c0a9330bcf7edafb6aa65d26b21dbb3267c9e8112/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:672b8adf25b1a0d35c96b5888b7b18699d27d4194bac8beeae75be4b7a3fc9b2", size = 396889, upload-time = "2025-10-14T15:06:07.035Z" },
     { url = "https://files.pythonhosted.org/packages/0c/e5/0072cef3804ce8d3aaddbfe7788aadff6b3d3f98a286fdbee9fd74ca59a7/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a13aea58bc2b90173bc69f2a90de8e282648939a00a602e1dc4ee23e26b66d", size = 451616, upload-time = "2025-10-14T15:06:08.072Z" },
@@ -7853,14 +7609,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" },
 ]
 
+[[package]]
+name = "weasel"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cloudpathlib", marker = "python_full_version >= '3.11'" },
+    { name = "confection", marker = "python_full_version >= '3.11'" },
+    { name = "httpx", marker = "python_full_version >= '3.11'" },
+    { name = "packaging", marker = "python_full_version >= '3.11'" },
+    { name = "pydantic", marker = "python_full_version >= '3.11'" },
+    { name = "smart-open", marker = "python_full_version >= '3.11'" },
+    { name = "srsly", marker = "python_full_version >= '3.11'" },
+    { name = "typer", marker = "python_full_version >= '3.11'" },
+    { name = "wasabi", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ce/e5/e272bb9a045105a1fdf4b798d8086f5932a178f4d738f17a74f5c9e0ae9a/weasel-1.0.0.tar.gz", hash = "sha256:7b129b44c90cc543b760532974ca1e4eb30dad2aa2026f57bdce66354ae610fc", size = 38682, upload-time = "2026-03-20T08:10:25.266Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0a/07/57ebf7a6798b016c064bd0ca81b4c6a99daa4dc377b898bc7b41eb6b5af0/weasel-1.0.0-py3-none-any.whl", hash = "sha256:89518acee027f49d743126c3502d35e6dd14f5768be5c37c9af47c171b6005cc", size = 50713, upload-time = "2026-03-20T08:10:23.637Z" },
+]
+
 [[package]]
 name = "webdataset"
 version = "1.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "braceexpand" },
-    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or sys_platform == 'linux'" },
-    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and platform_machine != 'x86_64') or (python_full_version < '3.11' and sys_platform != 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and platform_machine != 'x86_64') or (python_full_version >= '3.11' and sys_platform != 'linux')" },
     { name = "pyyaml" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" }
@@ -7927,6 +7703,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" },
 ]
 
+[[package]]
+name = "werkzeug"
+version = "3.1.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markupsafe" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852, upload-time = "2026-04-02T18:49:14.268Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/8c/2e650f2afeb7ee576912636c23ddb621c91ac6a98e66dc8d29c3c69446e1/werkzeug-3.1.8-py3-none-any.whl", hash = "sha256:63a77fb8892bf28ebc3178683445222aa500e48ebad5ec77b0ad80f8726b1f50", size = 226459, upload-time = "2026-04-02T18:49:12.72Z" },
+]
+
 [[package]]
 name = "win32-setctime"
 version = "1.2.0"
@@ -7982,29 +7770,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" },
     { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" },
     { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" },
-    { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = "2025-08-12T05:52:40.965Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" },
-    { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" },
-    { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" },
-    { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" },
-    { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" },
-    { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" },
-    { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" },
-    { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" },
-    { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" },
-    { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" },
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
+[[package]]
+name = "xmltodict"
+version = "1.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/70/80f3b7c10d2630aa66414bf23d210386700aa390547278c789afa994fd7e/xmltodict-1.0.4.tar.gz", hash = "sha256:6d94c9f834dd9e44514162799d344d815a3a4faec913717a9ecbfa5be1bb8e61", size = 26124, upload-time = "2026-02-22T02:21:22.074Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/38/34/98a2f52245f4d47be93b580dae5f9861ef58977d73a79eb47c58f1ad1f3a/xmltodict-1.0.4-py3-none-any.whl", hash = "sha256:a4a00d300b0e1c59fc2bfccb53d7b2e88c32f200df138a0dd2229f842497026a", size = 13580, upload-time = "2026-02-22T02:21:21.039Z" },
+]
+
 [[package]]
 name = "xxhash"
 version = "3.6.0"
@@ -8086,36 +7863,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/19/fa/0172e350361d61febcea941b0cc541d6e6c8d65d153e85f850a7b256ff8a/xxhash-3.6.0-cp313-cp313t-win32.whl", hash = "sha256:1244460adc3a9be84731d72b8e80625788e5815b68da3da8b83f78115a40a7ec", size = 30916, upload-time = "2025-10-02T14:35:35.107Z" },
     { url = "https://files.pythonhosted.org/packages/ad/e6/e8cf858a2b19d6d45820f072eff1bea413910592ff17157cabc5f1227a16/xxhash-3.6.0-cp313-cp313t-win_amd64.whl", hash = "sha256:b1e420ef35c503869c4064f4a2f2b08ad6431ab7b229a05cce39d74268bca6b8", size = 31799, upload-time = "2025-10-02T14:35:36.165Z" },
     { url = "https://files.pythonhosted.org/packages/56/15/064b197e855bfb7b343210e82490ae672f8bc7cdf3ddb02e92f64304ee8a/xxhash-3.6.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ec44b73a4220623235f67a996c862049f375df3b1052d9899f40a6382c32d746", size = 28044, upload-time = "2025-10-02T14:35:37.195Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/5e/0138bc4484ea9b897864d59fce9be9086030825bc778b76cb5a33a906d37/xxhash-3.6.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a40a3d35b204b7cc7643cbcf8c9976d818cb47befcfac8bbefec8038ac363f3e", size = 32754, upload-time = "2025-10-02T14:35:38.245Z" },
-    { url = "https://files.pythonhosted.org/packages/18/d7/5dac2eb2ec75fd771957a13e5dda560efb2176d5203f39502a5fc571f899/xxhash-3.6.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a54844be970d3fc22630b32d515e79a90d0a3ddb2644d8d7402e3c4c8da61405", size = 30846, upload-time = "2025-10-02T14:35:39.6Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/71/8bc5be2bb00deb5682e92e8da955ebe5fa982da13a69da5a40a4c8db12fb/xxhash-3.6.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:016e9190af8f0a4e3741343777710e3d5717427f175adfdc3e72508f59e2a7f3", size = 194343, upload-time = "2025-10-02T14:35:40.69Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/3b/52badfb2aecec2c377ddf1ae75f55db3ba2d321c5e164f14461c90837ef3/xxhash-3.6.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4f6f72232f849eb9d0141e2ebe2677ece15adfd0fa599bc058aad83c714bb2c6", size = 213074, upload-time = "2025-10-02T14:35:42.29Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/2b/ae46b4e9b92e537fa30d03dbc19cdae57ed407e9c26d163895e968e3de85/xxhash-3.6.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:63275a8aba7865e44b1813d2177e0f5ea7eadad3dd063a21f7cf9afdc7054063", size = 212388, upload-time = "2025-10-02T14:35:43.929Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/80/49f88d3afc724b4ac7fbd664c8452d6db51b49915be48c6982659e0e7942/xxhash-3.6.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cd01fa2aa00d8b017c97eb46b9a794fbdca53fc14f845f5a328c71254b0abb7", size = 445614, upload-time = "2025-10-02T14:35:45.216Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/ba/603ce3961e339413543d8cd44f21f2c80e2a7c5cfe692a7b1f2cccf58f3c/xxhash-3.6.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0226aa89035b62b6a86d3c68df4d7c1f47a342b8683da2b60cedcddb46c4d95b", size = 194024, upload-time = "2025-10-02T14:35:46.959Z" },
-    { url = "https://files.pythonhosted.org/packages/78/d1/8e225ff7113bf81545cfdcd79eef124a7b7064a0bba53605ff39590b95c2/xxhash-3.6.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c6e193e9f56e4ca4923c61238cdaced324f0feac782544eb4c6d55ad5cc99ddd", size = 210541, upload-time = "2025-10-02T14:35:48.301Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/58/0f89d149f0bad89def1a8dd38feb50ccdeb643d9797ec84707091d4cb494/xxhash-3.6.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9176dcaddf4ca963d4deb93866d739a343c01c969231dbe21680e13a5d1a5bf0", size = 198305, upload-time = "2025-10-02T14:35:49.584Z" },
-    { url = "https://files.pythonhosted.org/packages/11/38/5eab81580703c4df93feb5f32ff8fa7fe1e2c51c1f183ee4e48d4bb9d3d7/xxhash-3.6.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c1ce4009c97a752e682b897aa99aef84191077a9433eb237774689f14f8ec152", size = 210848, upload-time = "2025-10-02T14:35:50.877Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/6b/953dc4b05c3ce678abca756416e4c130d2382f877a9c30a20d08ee6a77c0/xxhash-3.6.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:8cb2f4f679b01513b7adbb9b1b2f0f9cdc31b70007eaf9d59d0878809f385b11", size = 414142, upload-time = "2025-10-02T14:35:52.15Z" },
-    { url = "https://files.pythonhosted.org/packages/08/a9/238ec0d4e81a10eb5026d4a6972677cbc898ba6c8b9dbaec12ae001b1b35/xxhash-3.6.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:653a91d7c2ab54a92c19ccf43508b6a555440b9be1bc8be553376778be7f20b5", size = 191547, upload-time = "2025-10-02T14:35:53.547Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/ee/3cf8589e06c2164ac77c3bf0aa127012801128f1feebf2a079272da5737c/xxhash-3.6.0-cp314-cp314-win32.whl", hash = "sha256:a756fe893389483ee8c394d06b5ab765d96e68fbbfe6fde7aa17e11f5720559f", size = 31214, upload-time = "2025-10-02T14:35:54.746Z" },
-    { url = "https://files.pythonhosted.org/packages/02/5d/a19552fbc6ad4cb54ff953c3908bbc095f4a921bc569433d791f755186f1/xxhash-3.6.0-cp314-cp314-win_amd64.whl", hash = "sha256:39be8e4e142550ef69629c9cd71b88c90e9a5db703fecbcf265546d9536ca4ad", size = 32290, upload-time = "2025-10-02T14:35:55.791Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/11/dafa0643bc30442c887b55baf8e73353a344ee89c1901b5a5c54a6c17d39/xxhash-3.6.0-cp314-cp314-win_arm64.whl", hash = "sha256:25915e6000338999236f1eb68a02a32c3275ac338628a7eaa5a269c401995679", size = 28795, upload-time = "2025-10-02T14:35:57.162Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/db/0e99732ed7f64182aef4a6fb145e1a295558deec2a746265dcdec12d191e/xxhash-3.6.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c5294f596a9017ca5a3e3f8884c00b91ab2ad2933cf288f4923c3fd4346cf3d4", size = 32955, upload-time = "2025-10-02T14:35:58.267Z" },
-    { url = "https://files.pythonhosted.org/packages/55/f4/2a7c3c68e564a099becfa44bb3d398810cc0ff6749b0d3cb8ccb93f23c14/xxhash-3.6.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1cf9dcc4ab9cff01dfbba78544297a3a01dafd60f3bde4e2bfd016cf7e4ddc67", size = 31072, upload-time = "2025-10-02T14:35:59.382Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/d9/72a29cddc7250e8a5819dad5d466facb5dc4c802ce120645630149127e73/xxhash-3.6.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:01262da8798422d0685f7cef03b2bd3f4f46511b02830861df548d7def4402ad", size = 196579, upload-time = "2025-10-02T14:36:00.838Z" },
-    { url = "https://files.pythonhosted.org/packages/63/93/b21590e1e381040e2ca305a884d89e1c345b347404f7780f07f2cdd47ef4/xxhash-3.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:51a73fb7cb3a3ead9f7a8b583ffd9b8038e277cdb8cb87cf890e88b3456afa0b", size = 215854, upload-time = "2025-10-02T14:36:02.207Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/b8/edab8a7d4fa14e924b29be877d54155dcbd8b80be85ea00d2be3413a9ed4/xxhash-3.6.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b9c6df83594f7df8f7f708ce5ebeacfc69f72c9fbaaababf6cf4758eaada0c9b", size = 214965, upload-time = "2025-10-02T14:36:03.507Z" },
-    { url = "https://files.pythonhosted.org/packages/27/67/dfa980ac7f0d509d54ea0d5a486d2bb4b80c3f1bb22b66e6a05d3efaf6c0/xxhash-3.6.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:627f0af069b0ea56f312fd5189001c24578868643203bca1abbc2c52d3a6f3ca", size = 448484, upload-time = "2025-10-02T14:36:04.828Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/63/8ffc2cc97e811c0ca5d00ab36604b3ea6f4254f20b7bc658ca825ce6c954/xxhash-3.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aa912c62f842dfd013c5f21a642c9c10cd9f4c4e943e0af83618b4a404d9091a", size = 196162, upload-time = "2025-10-02T14:36:06.182Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/77/07f0e7a3edd11a6097e990f6e5b815b6592459cb16dae990d967693e6ea9/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:b465afd7909db30168ab62afe40b2fcf79eedc0b89a6c0ab3123515dc0df8b99", size = 213007, upload-time = "2025-10-02T14:36:07.733Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/d8/bc5fa0d152837117eb0bef6f83f956c509332ce133c91c63ce07ee7c4873/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:a881851cf38b0a70e7c4d3ce81fc7afd86fbc2a024f4cfb2a97cf49ce04b75d3", size = 200956, upload-time = "2025-10-02T14:36:09.106Z" },
-    { url = "https://files.pythonhosted.org/packages/26/a5/d749334130de9411783873e9b98ecc46688dad5db64ca6e04b02acc8b473/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9b3222c686a919a0f3253cfc12bb118b8b103506612253b5baeaac10d8027cf6", size = 213401, upload-time = "2025-10-02T14:36:10.585Z" },
-    { url = "https://files.pythonhosted.org/packages/89/72/abed959c956a4bfc72b58c0384bb7940663c678127538634d896b1195c10/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:c5aa639bc113e9286137cec8fadc20e9cd732b2cc385c0b7fa673b84fc1f2a93", size = 417083, upload-time = "2025-10-02T14:36:12.276Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/b3/62fd2b586283b7d7d665fb98e266decadf31f058f1cf6c478741f68af0cb/xxhash-3.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5c1343d49ac102799905e115aee590183c3921d475356cb24b4de29a4bc56518", size = 193913, upload-time = "2025-10-02T14:36:14.025Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" },
-    { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" },
     { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" },
     { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" },
     { url = "https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" },
@@ -8214,41 +7961,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/e5/11f140a58bf4c6ad7aca69a892bff0ee638c31bea4206748fc0df4ebcb3a/yarl-1.22.0-cp313-cp313t-win32.whl", hash = "sha256:1834bb90991cc2999f10f97f5f01317f99b143284766d197e43cd5b45eb18d03", size = 86943, upload-time = "2025-10-06T14:11:10.284Z" },
     { url = "https://files.pythonhosted.org/packages/31/74/8b74bae38ed7fe6793d0c15a0c8207bbb819cf287788459e5ed230996cdd/yarl-1.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff86011bd159a9d2dfc89c34cfd8aff12875980e3bd6a39ff097887520e60249", size = 93715, upload-time = "2025-10-06T14:11:11.739Z" },
     { url = "https://files.pythonhosted.org/packages/69/66/991858aa4b5892d57aef7ee1ba6b4d01ec3b7eb3060795d34090a3ca3278/yarl-1.22.0-cp313-cp313t-win_arm64.whl", hash = "sha256:7861058d0582b847bc4e3a4a4c46828a410bca738673f35a29ba3ca5db0b473b", size = 83857, upload-time = "2025-10-06T14:11:13.586Z" },
-    { url = "https://files.pythonhosted.org/packages/46/b3/e20ef504049f1a1c54a814b4b9bed96d1ac0e0610c3b4da178f87209db05/yarl-1.22.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:34b36c2c57124530884d89d50ed2c1478697ad7473efd59cfd479945c95650e4", size = 140520, upload-time = "2025-10-06T14:11:15.465Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/04/3532d990fdbab02e5ede063676b5c4260e7f3abea2151099c2aa745acc4c/yarl-1.22.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:0dd9a702591ca2e543631c2a017e4a547e38a5c0f29eece37d9097e04a7ac683", size = 93504, upload-time = "2025-10-06T14:11:17.106Z" },
-    { url = "https://files.pythonhosted.org/packages/11/63/ff458113c5c2dac9a9719ac68ee7c947cb621432bcf28c9972b1c0e83938/yarl-1.22.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:594fcab1032e2d2cc3321bb2e51271e7cd2b516c7d9aee780ece81b07ff8244b", size = 94282, upload-time = "2025-10-06T14:11:19.064Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/bc/315a56aca762d44a6aaaf7ad253f04d996cb6b27bad34410f82d76ea8038/yarl-1.22.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3d7a87a78d46a2e3d5b72587ac14b4c16952dd0887dbb051451eceac774411e", size = 372080, upload-time = "2025-10-06T14:11:20.996Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/3f/08e9b826ec2e099ea6e7c69a61272f4f6da62cb5b1b63590bb80ca2e4a40/yarl-1.22.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:852863707010316c973162e703bddabec35e8757e67fcb8ad58829de1ebc8590", size = 338696, upload-time = "2025-10-06T14:11:22.847Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/9f/90360108e3b32bd76789088e99538febfea24a102380ae73827f62073543/yarl-1.22.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:131a085a53bfe839a477c0845acf21efc77457ba2bcf5899618136d64f3303a2", size = 387121, upload-time = "2025-10-06T14:11:24.889Z" },
-    { url = "https://files.pythonhosted.org/packages/98/92/ab8d4657bd5b46a38094cfaea498f18bb70ce6b63508fd7e909bd1f93066/yarl-1.22.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:078a8aefd263f4d4f923a9677b942b445a2be970ca24548a8102689a3a8ab8da", size = 394080, upload-time = "2025-10-06T14:11:27.307Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/e7/d8c5a7752fef68205296201f8ec2bf718f5c805a7a7e9880576c67600658/yarl-1.22.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bca03b91c323036913993ff5c738d0842fc9c60c4648e5c8d98331526df89784", size = 372661, upload-time = "2025-10-06T14:11:29.387Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/2e/f4d26183c8db0bb82d491b072f3127fb8c381a6206a3a56332714b79b751/yarl-1.22.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:68986a61557d37bb90d3051a45b91fa3d5c516d177dfc6dd6f2f436a07ff2b6b", size = 364645, upload-time = "2025-10-06T14:11:31.423Z" },
-    { url = "https://files.pythonhosted.org/packages/80/7c/428e5812e6b87cd00ee8e898328a62c95825bf37c7fa87f0b6bb2ad31304/yarl-1.22.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:4792b262d585ff0dff6bcb787f8492e40698443ec982a3568c2096433660c694", size = 355361, upload-time = "2025-10-06T14:11:33.055Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/2a/249405fd26776f8b13c067378ef4d7dd49c9098d1b6457cdd152a99e96a9/yarl-1.22.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ebd4549b108d732dba1d4ace67614b9545b21ece30937a63a65dd34efa19732d", size = 381451, upload-time = "2025-10-06T14:11:35.136Z" },
-    { url = "https://files.pythonhosted.org/packages/67/a8/fb6b1adbe98cf1e2dd9fad71003d3a63a1bc22459c6e15f5714eb9323b93/yarl-1.22.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f87ac53513d22240c7d59203f25cc3beac1e574c6cd681bbfd321987b69f95fd", size = 383814, upload-time = "2025-10-06T14:11:37.094Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/f9/3aa2c0e480fb73e872ae2814c43bc1e734740bb0d54e8cb2a95925f98131/yarl-1.22.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:22b029f2881599e2f1b06f8f1db2ee63bd309e2293ba2d566e008ba12778b8da", size = 370799, upload-time = "2025-10-06T14:11:38.83Z" },
-    { url = "https://files.pythonhosted.org/packages/50/3c/af9dba3b8b5eeb302f36f16f92791f3ea62e3f47763406abf6d5a4a3333b/yarl-1.22.0-cp314-cp314-win32.whl", hash = "sha256:6a635ea45ba4ea8238463b4f7d0e721bad669f80878b7bfd1f89266e2ae63da2", size = 82990, upload-time = "2025-10-06T14:11:40.624Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/30/ac3a0c5bdc1d6efd1b41fa24d4897a4329b3b1e98de9449679dd327af4f0/yarl-1.22.0-cp314-cp314-win_amd64.whl", hash = "sha256:0d6e6885777af0f110b0e5d7e5dda8b704efed3894da26220b7f3d887b839a79", size = 88292, upload-time = "2025-10-06T14:11:42.578Z" },
-    { url = "https://files.pythonhosted.org/packages/df/0a/227ab4ff5b998a1b7410abc7b46c9b7a26b0ca9e86c34ba4b8d8bc7c63d5/yarl-1.22.0-cp314-cp314-win_arm64.whl", hash = "sha256:8218f4e98d3c10d683584cb40f0424f4b9fd6e95610232dd75e13743b070ee33", size = 82888, upload-time = "2025-10-06T14:11:44.863Z" },
-    { url = "https://files.pythonhosted.org/packages/06/5e/a15eb13db90abd87dfbefb9760c0f3f257ac42a5cac7e75dbc23bed97a9f/yarl-1.22.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45c2842ff0e0d1b35a6bf1cd6c690939dacb617a70827f715232b2e0494d55d1", size = 146223, upload-time = "2025-10-06T14:11:46.796Z" },
-    { url = "https://files.pythonhosted.org/packages/18/82/9665c61910d4d84f41a5bf6837597c89e665fa88aa4941080704645932a9/yarl-1.22.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:d947071e6ebcf2e2bee8fce76e10faca8f7a14808ca36a910263acaacef08eca", size = 95981, upload-time = "2025-10-06T14:11:48.845Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/9a/2f65743589809af4d0a6d3aa749343c4b5f4c380cc24a8e94a3c6625a808/yarl-1.22.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:334b8721303e61b00019474cc103bdac3d7b1f65e91f0bfedeec2d56dfe74b53", size = 97303, upload-time = "2025-10-06T14:11:50.897Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/ab/5b13d3e157505c43c3b43b5a776cbf7b24a02bc4cccc40314771197e3508/yarl-1.22.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e7ce67c34138a058fd092f67d07a72b8e31ff0c9236e751957465a24b28910c", size = 361820, upload-time = "2025-10-06T14:11:52.549Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/76/242a5ef4677615cf95330cfc1b4610e78184400699bdda0acb897ef5e49a/yarl-1.22.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d77e1b2c6d04711478cb1c4ab90db07f1609ccf06a287d5607fcd90dc9863acf", size = 323203, upload-time = "2025-10-06T14:11:54.225Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/96/475509110d3f0153b43d06164cf4195c64d16999e0c7e2d8a099adcd6907/yarl-1.22.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4647674b6150d2cae088fc07de2738a84b8bcedebef29802cf0b0a82ab6face", size = 363173, upload-time = "2025-10-06T14:11:56.069Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/66/59db471aecfbd559a1fd48aedd954435558cd98c7d0da8b03cc6c140a32c/yarl-1.22.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efb07073be061c8f79d03d04139a80ba33cbd390ca8f0297aae9cce6411e4c6b", size = 373562, upload-time = "2025-10-06T14:11:58.783Z" },
-    { url = "https://files.pythonhosted.org/packages/03/1f/c5d94abc91557384719da10ff166b916107c1b45e4d0423a88457071dd88/yarl-1.22.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e51ac5435758ba97ad69617e13233da53908beccc6cfcd6c34bbed8dcbede486", size = 339828, upload-time = "2025-10-06T14:12:00.686Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/97/aa6a143d3afba17b6465733681c70cf175af89f76ec8d9286e08437a7454/yarl-1.22.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33e32a0dd0c8205efa8e83d04fc9f19313772b78522d1bdc7d9aed706bfd6138", size = 347551, upload-time = "2025-10-06T14:12:02.628Z" },
-    { url = "https://files.pythonhosted.org/packages/43/3c/45a2b6d80195959239a7b2a8810506d4eea5487dce61c2a3393e7fc3c52e/yarl-1.22.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:bf4a21e58b9cde0e401e683ebd00f6ed30a06d14e93f7c8fd059f8b6e8f87b6a", size = 334512, upload-time = "2025-10-06T14:12:04.871Z" },
-    { url = "https://files.pythonhosted.org/packages/86/a0/c2ab48d74599c7c84cb104ebd799c5813de252bea0f360ffc29d270c2caa/yarl-1.22.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e4b582bab49ac33c8deb97e058cd67c2c50dac0dd134874106d9c774fd272529", size = 352400, upload-time = "2025-10-06T14:12:06.624Z" },
-    { url = "https://files.pythonhosted.org/packages/32/75/f8919b2eafc929567d3d8411f72bdb1a2109c01caaab4ebfa5f8ffadc15b/yarl-1.22.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0b5bcc1a9c4839e7e30b7b30dd47fe5e7e44fb7054ec29b5bb8d526aa1041093", size = 357140, upload-time = "2025-10-06T14:12:08.362Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/72/6a85bba382f22cf78add705d8c3731748397d986e197e53ecc7835e76de7/yarl-1.22.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c0232bce2170103ec23c454e54a57008a9a72b5d1c3105dc2496750da8cfa47c", size = 341473, upload-time = "2025-10-06T14:12:10.994Z" },
-    { url = "https://files.pythonhosted.org/packages/35/18/55e6011f7c044dc80b98893060773cefcfdbf60dfefb8cb2f58b9bacbd83/yarl-1.22.0-cp314-cp314t-win32.whl", hash = "sha256:8009b3173bcd637be650922ac455946197d858b3630b6d8787aa9e5c4564533e", size = 89056, upload-time = "2025-10-06T14:12:13.317Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/86/0f0dccb6e59a9e7f122c5afd43568b1d31b8ab7dda5f1b01fb5c7025c9a9/yarl-1.22.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9fb17ea16e972c63d25d4a97f016d235c78dd2344820eb35bc034bc32012ee27", size = 96292, upload-time = "2025-10-06T14:12:15.398Z" },
-    { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" },
     { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" },
 ]
 
+[[package]]
+name = "yq"
+version = "3.4.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "argcomplete" },
+    { name = "pyyaml" },
+    { name = "tomlkit" },
+    { name = "xmltodict" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/38/6a/eb9721ed0929d0f55d167c2222d288b529723afbef0a07ed7aa6cca72380/yq-3.4.3.tar.gz", hash = "sha256:ba586a1a6f30cf705b2f92206712df2281cd320280210e7b7b80adcb8f256e3b", size = 33214, upload-time = "2024-04-27T15:39:43.29Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/ba/d1b21f3e57469030bd6536b91bb28fedd2511d4e68b5a575f2bdb3a3dbb6/yq-3.4.3-py3-none-any.whl", hash = "sha256:547e34bc3caacce83665fd3429bf7c85f8e8b6b9aaee3f953db1ad716ff3434d", size = 18812, upload-time = "2024-04-27T15:39:41.652Z" },
+]
+
 [[package]]
 name = "zict"
 version = "3.0.0"