Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/nemotron/recipes/embed/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ entity = "my-team"
# Local Docker execution profile
[local-docker]
executor = "docker"
container_image = "nvcr.io/nvidia/pytorch:25.01-py3"
container_image = "nvcr.io/nvidia/nemo-automodel:26.04"
runtime = "nvidia" # Enable GPU passthrough
ipc_mode = "host"
shm_size = "16g"
Expand All @@ -371,7 +371,7 @@ executor = "slurm"
account = "my-account"
partition = "interactive"
batch_partition = "batch"
container_image = "nvcr.io/nvidia/pytorch:25.01-py3"
container_image = "nvcr.io/nvidia/nemo-automodel:26.04"
tunnel = "ssh"
host = "cluster.example.com"
user = "username"
Expand Down Expand Up @@ -448,9 +448,12 @@ test_ratio: 0.1 # Test split (10%)
**Stage 2: Finetune**
```yaml
base_model: nvidia/llama-nemotron-embed-1b-v2
trust_remote_code: true
num_epochs: 3
global_batch_size: 128
learning_rate: 1.0e-5
optimizer_backend: auto # FusedAdam in Automodel container, FlashAdamW fallback
flash_adamw_master_weight_bits: 32
query_max_length: 512 # Max query tokens (check your base model's max sequence length)
passage_max_length: 512 # Max passage tokens (check your base model's max sequence length)
# attn_implementation: null # Auto-detects flash_attention_2 if available, else sdpa
Expand Down
38 changes: 20 additions & 18 deletions src/nemotron/recipes/embed/stage2_finetune/biencoder_base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@

# Base configuration for biencoder fine-tuning using nemo-automodel
# This file provides defaults that will be overridden by the Nemotron CLI
# Source: https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/biencoder/llama3_2_1b_biencoder.yaml
# Source: https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml

recipe: TrainBiEncoderRecipe

seed: 42

temperature: 0.02

step_scheduler:
global_batch_size: 128
local_batch_size: 4
Expand All @@ -30,39 +34,33 @@ dist_env:
timeout_minutes: 1

model:
_target_: nemo_automodel.components.models.biencoder.NeMoAutoModelBiencoder.from_pretrained
_target_: nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
share_encoder: true
add_linear_pooler: false
out_dimension: 768
do_gradient_checkpointing: false
train_n_passages: 5
eval_negative_size: 4
attn_implementation: sdpa # Use PyTorch SDPA as fallback (works with/without flash-attn)
pooling: avg
l2_normalize: true
t: 0.02
use_liger_kernel: true
use_sdpa_patching: true
torch_dtype: bfloat16

tokenizer:
_target_: nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained
_target_: nemo_automodel.NeMoAutoTokenizer.from_pretrained
pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
add_eos_token: false

dataloader:
_target_: torchdata.stateful_dataloader.StatefulDataLoader
dataset:
_target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset
model_type: bi_encoder
data_dir_list:
- training_datasets/nqsh_shuffled_50k.json
data_type: train
train_n_passages: 5
eval_negative_size: 4
n_passages: 5
seed: 42
do_shuffle: true
collate_fn:
_target_: nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator
_target_: nemo_automodel.components.datasets.llm.BiEncoderCollator
q_max_len: 512
p_max_len: 512
query_prefix: "query:"
Expand All @@ -72,11 +70,12 @@ dataloader:
num_workers: 0

optimizer:
_target_: torch.optim.AdamW
_target_: transformer_engine.pytorch.optimizers.fused_adam.FusedAdam
lr: 5.0e-6
weight_decay: 0.01
betas: [0.9, 0.999]
eps: 1.0e-8
adam_w_mode: true
bias_correction: true
master_weights: true

lr_scheduler:
lr_warmup_steps: 1
Expand All @@ -88,6 +87,9 @@ checkpoint:
model_save_format: safetensors
save_consolidated: true

distributed_config:
_target_: nemo_automodel.components.distributed.config.FSDP2Config
distributed:
strategy: fsdp2
dp_size: none
tp_size: 1
cp_size: 1
sequence_parallel: false
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
run:
env:
# Container image for remote execution (Docker/Slurm)
container: nvcr.io/nvidia/pytorch:25.12-py3
container: nvcr.io/nvidia/nemo-automodel:26.04

# Base embedding model to fine-tune
base_model: nvidia/llama-nemotron-embed-1b-v2
trust_remote_code: true

# Path to training data (output from stage1_data_prep)
train_data_path: ./output/embed/stage1_data_prep/train_mined.automodel_unrolled.json
Expand All @@ -34,6 +35,12 @@ lr_warmup_steps: 5
lr_decay_style: cosine # cosine maintains higher LR longer than linear
weight_decay: 0.01

# Optimizer
# auto uses Transformer Engine FusedAdam when available in the Automodel
# container, otherwise FlashAdamW with bf16 model parameters.
optimizer_backend: auto
flash_adamw_master_weight_bits: 32

# Model architecture
# attn_implementation: null # Auto-detects: flash_attention_2 if available, else sdpa
train_n_passages: 5 # 1 positive + 4 negatives
Expand Down
5 changes: 1 addition & 4 deletions src/nemotron/recipes/embed/stage2_finetune/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,12 @@ name = "recipe-runner"
version = "0.0.0"
requires-python = ">=3.12,<3.13"
dependencies = [
"nemo-automodel @ git+https://github.com/NVIDIA-NeMo/Automodel.git@ecd7cb4297ae06990c94b7581f34c03ce8ed5488",
"nemo-automodel==0.4.0",
# Needed by Nemotron library (added via --with flag)
"omegaconf>=2.3.0",
"pydantic-settings>=2.0.0",
# Local GPU torch wheel, sourced from PyTorch cu129 index on Linux below
"torch>=2.10.0",
# Pin transformers <5.2 — nemo-automodel's check_model_inputs decorator
# is a ContextDecorator in 5.2.0 which breaks biencoder forward()
"transformers>=5.0,<5.2",
]

# Note: exclude-dependencies are injected dynamically by run_uv.py for Docker/Slurm
Expand Down
115 changes: 103 additions & 12 deletions src/nemotron/recipes/embed/stage2_finetune/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
# schema = "1"
# docs = "https://raw.githubusercontent.com/NVIDIA-NeMo/Nemotron/main/docs/runspec/v1/spec.md"
# name = "embed/finetune"
# image = "nvcr.io/nvidia/pytorch:25.12-py3"
# setup = "PyTorch pre-installed. Stage dependencies resolved via UV at runtime."
# image = "nvcr.io/nvidia/nemo-automodel:26.04"
# setup = "NeMo Automodel pre-installed. Stage dependencies resolved via UV at runtime."
#
# [tool.runspec.run]
# launch = "torchrun"
Expand Down Expand Up @@ -50,11 +50,12 @@

from __future__ import annotations

import importlib
import json
import os
import sys
from pathlib import Path
from typing import Literal
from typing import Any, Literal

from pydantic import ConfigDict, Field

Expand Down Expand Up @@ -82,6 +83,10 @@ class FinetuneConfig(RecipeSettings):
default="nvidia/llama-nemotron-embed-1b-v2",
description="Base embedding model to fine-tune.",
)
trust_remote_code: bool = Field(
default=True,
description="Allow Hugging Face custom model code. Required by the default Nemotron Embed model.",
)

# Data paths
train_data_path: Path = Field(
Expand All @@ -107,6 +112,14 @@ class FinetuneConfig(RecipeSettings):
description="LR decay schedule (cosine, linear).",
)
weight_decay: float = Field(default=0.01, ge=0, description="Weight decay for optimizer.")
optimizer_backend: Literal["auto", "fused_adam", "flash_adamw"] = Field(
default="auto",
description="Optimizer backend. 'auto' uses FusedAdam when available, otherwise FlashAdamW.",
)
flash_adamw_master_weight_bits: Literal[24, 32] = Field(
default=32,
description="Effective master-weight precision for FlashAdamW when Transformer Engine is unavailable.",
)

# Model architecture
attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] | None = Field(
Expand Down Expand Up @@ -213,6 +226,74 @@ def _auto_scale_hyperparams(
return global_batch_size, num_epochs, checkpoint_every_steps, val_every_steps


def _can_import_fused_adam() -> tuple[bool, str | None]:
"""Return whether Transformer Engine FusedAdam is importable."""
try:
importlib.import_module("transformer_engine.pytorch.optimizers.fused_adam")
except Exception as e:
return False, str(e)
return True, None


def _can_import_flash_adamw() -> tuple[bool, str | None]:
"""Return whether FlashAdamW is importable."""
try:
importlib.import_module("flashoptim")
except Exception as e:
return False, str(e)
return True, None


def _load_automodel_config(cfg: FinetuneConfig, config_node_cls: type) -> tuple[Any, str]:
    """Load the base Automodel YAML after choosing an importable optimizer.

    The optimizer section is rewritten on the raw YAML mapping, before
    ``config_node_cls`` is constructed from it.

    Args:
        cfg: Fine-tuning settings. Reads ``optimizer_backend``,
            ``learning_rate``, ``weight_decay`` and
            ``flash_adamw_master_weight_bits``.
        config_node_cls: Callable invoked with the (possibly modified) raw
            mapping; its result is returned as the config object.

    Returns:
        ``(config, backend)`` where ``backend`` is the resolved optimizer
        backend: ``"fused_adam"`` or ``"flash_adamw"`` (``"auto"`` is resolved).

    Raises:
        SystemExit: With status 1 when the selected backend cannot be imported.
    """
    import yaml

    base_config_path = STAGE_PATH / "biencoder_base.yaml"
    # Explicit encoding so parsing does not depend on the process locale.
    with open(base_config_path, encoding="utf-8") as f:
        raw_config = yaml.safe_load(f)

    optimizer_backend = cfg.optimizer_backend
    auto_selected = optimizer_backend == "auto"

    # Probe lazily: only import a backend that can actually be selected, so an
    # explicit flash_adamw request never imports Transformer Engine and an
    # explicit/available fused_adam never imports flashoptim.
    te_available, te_error = False, None
    if optimizer_backend in ("auto", "fused_adam"):
        te_available, te_error = _can_import_fused_adam()
    if auto_selected:
        optimizer_backend = "fused_adam" if te_available else "flash_adamw"

    if optimizer_backend == "fused_adam":
        if not te_available:
            print("Error: optimizer_backend=fused_adam requires Transformer Engine.", file=sys.stderr)
            if te_error:
                print(f"  Import error: {te_error}", file=sys.stderr)
            print(
                "  Use optimizer_backend=flash_adamw for local runs without Transformer Engine.",
                file=sys.stderr,
            )
            sys.exit(1)
    elif optimizer_backend == "flash_adamw":
        flash_available, flash_error = _can_import_flash_adamw()
        if not flash_available:
            print("Error: optimizer_backend=flash_adamw requires flashoptim.", file=sys.stderr)
            if flash_error:
                print(f"  Import error: {flash_error}", file=sys.stderr)
            if auto_selected and te_error:
                # In auto mode the user never asked for flash_adamw; explain
                # why the preferred backend was skipped as well.
                print(
                    f"  optimizer_backend=auto fell back from FusedAdam; import error: {te_error}",
                    file=sys.stderr,
                )
            print(
                "  Install flashoptim, or run in an environment with Transformer Engine FusedAdam.",
                file=sys.stderr,
            )
            sys.exit(1)

        # Tolerate a missing or null ``optimizer`` section in the base YAML.
        base_optimizer = raw_config.get("optimizer") or {}
        raw_config["optimizer"] = {
            "_target_": "flashoptim.FlashAdamW",
            "lr": base_optimizer.get("lr", cfg.learning_rate),
            "weight_decay": base_optimizer.get("weight_decay", cfg.weight_decay),
            "betas": [0.9, 0.999],
            "eps": 1.0e-8,
            "quantize": False,
            "compress_state_dict": False,
            "master_weight_bits": cfg.flash_adamw_master_weight_bits,
            "fused": True,
        }
        # The FlashAdamW path always runs the model in bf16.
        raw_config.setdefault("model", {})["torch_dtype"] = "bfloat16"

    return config_node_cls(raw_config), optimizer_backend


def run_finetune(cfg: FinetuneConfig) -> Path:
"""Run embedding model fine-tuning using nemo-automodel.

Expand Down Expand Up @@ -278,22 +359,32 @@ def run_finetune(cfg: FinetuneConfig) -> Path:

# Import nemo-automodel components
try:
from nemo_automodel.components.config.loader import load_yaml_config
from nemo_automodel.recipes.biencoder import TrainBiencoderRecipe
from nemo_automodel.components.config.loader import ConfigNode
from nemo_automodel.recipes.retrieval import TrainBiEncoderRecipe
except ImportError as e:
print("Error: Failed to import nemo-automodel. Is it installed?", file=sys.stderr)
print(" Install with: pip install nemo-automodel", file=sys.stderr)
print(f" Error: {e}", file=sys.stderr)
sys.exit(1)

# Load base config from nemo-automodel defaults
base_config_path = STAGE_PATH / "biencoder_base.yaml"
automodel_cfg = load_yaml_config(str(base_config_path))
# Load base config from nemo-automodel defaults. ConfigNode resolves _target_
# imports during construction, so optimizer selection must happen on raw YAML.
automodel_cfg, optimizer_backend = _load_automodel_config(cfg, ConfigNode)
optimizer_detail = optimizer_backend
if optimizer_backend == "flash_adamw":
optimizer_detail = (
f"{optimizer_backend} "
f"(bf16 model, {cfg.flash_adamw_master_weight_bits}-bit master weights)"
)
print(f"Optimizer: {optimizer_detail}")
print()

# Apply overrides from our config
# Model settings
automodel_cfg.model.pretrained_model_name_or_path = cfg.base_model
automodel_cfg.tokenizer.pretrained_model_name_or_path = cfg.base_model
automodel_cfg.model.trust_remote_code = cfg.trust_remote_code
automodel_cfg.tokenizer.trust_remote_code = cfg.trust_remote_code
# Auto-detect attention implementation if not explicitly set
if cfg.attn_implementation is not None:
attn_impl = cfg.attn_implementation
Expand All @@ -308,7 +399,7 @@ def run_finetune(cfg: FinetuneConfig) -> Path:

# Data settings
automodel_cfg.dataloader.dataset.data_dir_list = [str(cfg.train_data_path)]
automodel_cfg.dataloader.dataset.train_n_passages = cfg.train_n_passages
automodel_cfg.dataloader.dataset.n_passages = cfg.train_n_passages
automodel_cfg.dataloader.collate_fn.q_max_len = cfg.query_max_length
automodel_cfg.dataloader.collate_fn.p_max_len = cfg.passage_max_length
automodel_cfg.dataloader.collate_fn.query_prefix = cfg.query_prefix
Expand All @@ -330,13 +421,13 @@ def run_finetune(cfg: FinetuneConfig) -> Path:
# Model architecture
automodel_cfg.model.pooling = cfg.pooling
automodel_cfg.model.l2_normalize = cfg.l2_normalize
automodel_cfg.model.t = cfg.temperature
automodel_cfg.temperature = cfg.temperature

# Checkpoint settings
automodel_cfg.checkpoint.checkpoint_dir = str(cfg.checkpoint_dir)

# Create and run the biencoder recipe
recipe = TrainBiencoderRecipe(automodel_cfg)
# Create and run the bi-encoder recipe
recipe = TrainBiEncoderRecipe(automodel_cfg)
recipe.setup()
recipe.run_train_validation_loop()

Expand Down
Loading
Loading