NVIDIA-NeMo · oliverholworthy · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/src/nemotron/recipes/embed/README.md b/src/nemotron/recipes/embed/README.md
@@ -356,7 +356,7 @@ entity = "my-team"
 # Local Docker execution profile
 [local-docker]
 executor = "docker"
-container_image = "nvcr.io/nvidia/pytorch:25.01-py3"
+container_image = "nvcr.io/nvidia/nemo-automodel:26.04"
 runtime = "nvidia"  # Enable GPU passthrough
 ipc_mode = "host"
 shm_size = "16g"
@@ -371,7 +371,7 @@ executor = "slurm"
 account = "my-account"
 partition = "interactive"
 batch_partition = "batch"
-container_image = "nvcr.io/nvidia/pytorch:25.01-py3"
+container_image = "nvcr.io/nvidia/nemo-automodel:26.04"
 tunnel = "ssh"
 host = "cluster.example.com"
 user = "username"
@@ -448,9 +448,12 @@ test_ratio: 0.1                # Test split (10%)
 **Stage 2: Finetune**
 ```yaml
 base_model: nvidia/llama-nemotron-embed-1b-v2
+trust_remote_code: true
 num_epochs: 3
 global_batch_size: 128
 learning_rate: 1.0e-5
+optimizer_backend: auto        # FusedAdam in Automodel container, FlashAdamW fallback
+flash_adamw_master_weight_bits: 32
 query_max_length: 512          # Max query tokens (check your base model's max sequence length)
 passage_max_length: 512        # Max passage tokens (check your base model's max sequence length)
 # attn_implementation: null    # Auto-detects flash_attention_2 if available, else sdpa

diff --git a/src/nemotron/recipes/embed/stage2_finetune/biencoder_base.yaml b/src/nemotron/recipes/embed/stage2_finetune/biencoder_base.yaml
@@ -14,10 +14,14 @@
 
 # Base configuration for biencoder fine-tuning using nemo-automodel
 # This file provides defaults that will be overridden by the Nemotron CLI
-# Source: https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/biencoder/llama3_2_1b_biencoder.yaml
+# Source: https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml
+
+recipe: TrainBiEncoderRecipe
 
 seed: 42
 
+temperature: 0.02
+
 step_scheduler:
   global_batch_size: 128
   local_batch_size: 4
@@ -30,39 +34,33 @@ dist_env:
   timeout_minutes: 1
 
 model:
-  _target_: nemo_automodel.components.models.biencoder.NeMoAutoModelBiencoder.from_pretrained
+  _target_: nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained
   pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
-  share_encoder: true
-  add_linear_pooler: false
-  out_dimension: 768
-  do_gradient_checkpointing: false
-  train_n_passages: 5
-  eval_negative_size: 4
   attn_implementation: sdpa  # Use PyTorch SDPA as fallback (works with/without flash-attn)
   pooling: avg
   l2_normalize: true
-  t: 0.02
   use_liger_kernel: true
   use_sdpa_patching: true
   torch_dtype: bfloat16
 
 tokenizer:
-  _target_: nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained
+  _target_: nemo_automodel.NeMoAutoTokenizer.from_pretrained
   pretrained_model_name_or_path: meta-llama/Llama-3.2-1B
+  add_eos_token: false
 
 dataloader:
   _target_: torchdata.stateful_dataloader.StatefulDataLoader
   dataset:
     _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset
+    model_type: bi_encoder
     data_dir_list:
       - training_datasets/nqsh_shuffled_50k.json
     data_type: train
-    train_n_passages: 5
-    eval_negative_size: 4
+    n_passages: 5
     seed: 42
     do_shuffle: true
   collate_fn:
-    _target_: nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator
+    _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator
     q_max_len: 512
     p_max_len: 512
     query_prefix: "query:"
@@ -72,11 +70,12 @@ dataloader:
   num_workers: 0
 
 optimizer:
-  _target_: torch.optim.AdamW
+  _target_: transformer_engine.pytorch.optimizers.fused_adam.FusedAdam
   lr: 5.0e-6
   weight_decay: 0.01
-  betas: [0.9, 0.999]
-  eps: 1.0e-8
+  adam_w_mode: true
+  bias_correction: true
+  master_weights: true
 
 lr_scheduler:
   lr_warmup_steps: 1
@@ -88,6 +87,9 @@ checkpoint:
   model_save_format: safetensors
   save_consolidated: true
 
-distributed_config:
-  _target_: nemo_automodel.components.distributed.config.FSDP2Config
+distributed:
+  strategy: fsdp2
+  dp_size: none
+  tp_size: 1
+  cp_size: 1
   sequence_parallel: false
diff --git a/src/nemotron/recipes/embed/stage2_finetune/config/default.yaml b/src/nemotron/recipes/embed/stage2_finetune/config/default.yaml
@@ -14,10 +14,11 @@
 run:
   env:
     # Container image for remote execution (Docker/Slurm)
-    container: nvcr.io/nvidia/pytorch:25.12-py3
+    container: nvcr.io/nvidia/nemo-automodel:26.04
 
 # Base embedding model to fine-tune
 base_model: nvidia/llama-nemotron-embed-1b-v2
+trust_remote_code: true
 
 # Path to training data (output from stage1_data_prep)
 train_data_path: ./output/embed/stage1_data_prep/train_mined.automodel_unrolled.json
@@ -34,6 +35,12 @@ lr_warmup_steps: 5
 lr_decay_style: cosine  # cosine maintains higher LR longer than linear
 weight_decay: 0.01
 
+# Optimizer
+# "auto" uses Transformer Engine FusedAdam when it is importable (e.g. in the
+# Automodel container); otherwise it falls back to FlashAdamW with bf16 model parameters.
+optimizer_backend: auto
+flash_adamw_master_weight_bits: 32
+
 # Model architecture
 # attn_implementation: null  # Auto-detects: flash_attention_2 if available, else sdpa
 train_n_passages: 5  # 1 positive + 4 negatives

diff --git a/src/nemotron/recipes/embed/stage2_finetune/pyproject.toml b/src/nemotron/recipes/embed/stage2_finetune/pyproject.toml
@@ -3,15 +3,12 @@ name = "recipe-runner"
 version = "0.0.0"
 requires-python = ">=3.12,<3.13"
 dependencies = [
-  "nemo-automodel @ git+https://github.com/NVIDIA-NeMo/Automodel.git@ecd7cb4297ae06990c94b7581f34c03ce8ed5488",
+  "nemo-automodel==0.4.0",
   # Needed by Nemotron library (added via --with flag)
   "omegaconf>=2.3.0",
   "pydantic-settings>=2.0.0",
   # Local GPU torch wheel, sourced from PyTorch cu129 index on Linux below
   "torch>=2.10.0",
-  # Pin transformers <5.2 — nemo-automodel's check_model_inputs decorator
-  # is a ContextDecorator in 5.2.0 which breaks biencoder forward()
-  "transformers>=5.0,<5.2",
 ]
 
 # Note: exclude-dependencies are injected dynamically by run_uv.py for Docker/Slurm

diff --git a/src/nemotron/recipes/embed/stage2_finetune/train.py b/src/nemotron/recipes/embed/stage2_finetune/train.py
@@ -4,8 +4,8 @@
 # schema = "1"
 # docs = "https://raw.githubusercontent.com/NVIDIA-NeMo/Nemotron/main/docs/runspec/v1/spec.md"
 # name = "embed/finetune"
-# image = "nvcr.io/nvidia/pytorch:25.12-py3"
-# setup = "PyTorch pre-installed. Stage dependencies resolved via UV at runtime."
+# image = "nvcr.io/nvidia/nemo-automodel:26.04"
+# setup = "NeMo Automodel pre-installed. Stage dependencies resolved via UV at runtime."
 #
 # [tool.runspec.run]
 # launch = "torchrun"
@@ -50,11 +50,12 @@
 
 from __future__ import annotations
 
+import importlib
 import json
 import os
 import sys
 from pathlib import Path
-from typing import Literal
+from typing import Any, Literal
 
 from pydantic import ConfigDict, Field
 
@@ -82,6 +83,10 @@ class FinetuneConfig(RecipeSettings):
         default="nvidia/llama-nemotron-embed-1b-v2",
         description="Base embedding model to fine-tune.",
     )
+    trust_remote_code: bool = Field(
+        default=True,
+        description="Allow Hugging Face custom model code. Required by the default Nemotron Embed model.",
+    )
 
     # Data paths
     train_data_path: Path = Field(
@@ -107,6 +112,14 @@ class FinetuneConfig(RecipeSettings):
         description="LR decay schedule (cosine, linear).",
     )
     weight_decay: float = Field(default=0.01, ge=0, description="Weight decay for optimizer.")
+    optimizer_backend: Literal["auto", "fused_adam", "flash_adamw"] = Field(
+        default="auto",
+        description="Optimizer backend. 'auto' uses FusedAdam when available, otherwise FlashAdamW.",
+    )
+    flash_adamw_master_weight_bits: Literal[24, 32] = Field(
+        default=32,
+        description="Effective master-weight precision for FlashAdamW when Transformer Engine is unavailable.",
+    )
 
     # Model architecture
     attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] | None = Field(
@@ -213,6 +226,74 @@ def _auto_scale_hyperparams(
     return global_batch_size, num_epochs, checkpoint_every_steps, val_every_steps
 
 
+def _can_import_fused_adam() -> tuple[bool, str | None]:
+    """Probe Transformer Engine FusedAdam: ``(True, None)`` if importable, else ``(False, error text)``."""
+    try:
+        importlib.import_module("transformer_engine.pytorch.optimizers.fused_adam")
+    except Exception as e:  # any Exception raised by the import counts as "unavailable"
+        return False, str(e)
+    return True, None
+
+
+def _can_import_flash_adamw() -> tuple[bool, str | None]:
+    """Probe the ``flashoptim`` package: ``(True, None)`` if importable, else ``(False, error text)``."""
+    try:
+        importlib.import_module("flashoptim")
+    except Exception as e:  # any Exception raised by the import counts as "unavailable"
+        return False, str(e)
+    return True, None
+
+
+def _load_automodel_config(cfg: FinetuneConfig, config_node_cls: type) -> tuple[Any, str]:
+    """Load biencoder_base.yaml, pick an optimizer backend, return ``(config_node_cls(raw), backend)``."""
+    import yaml
+
+    base_config_path = STAGE_PATH / "biencoder_base.yaml"
+    with open(base_config_path) as f:
+        raw_config = yaml.safe_load(f)  # kept as plain data so the optimizer section can be edited below
+
+    te_available, te_error = _can_import_fused_adam()  # both probes run regardless of cfg.optimizer_backend
+    flash_available, flash_error = _can_import_flash_adamw()
+    optimizer_backend = cfg.optimizer_backend
+    if optimizer_backend == "auto":
+        optimizer_backend = "fused_adam" if te_available else "flash_adamw"  # FusedAdam preferred
+
+    if optimizer_backend == "fused_adam":  # this branch leaves the YAML optimizer section untouched
+        if not te_available:
+            print("Error: optimizer_backend=fused_adam requires Transformer Engine.", file=sys.stderr)
+            if te_error:
+                print(f"  Import error: {te_error}", file=sys.stderr)
+            print(
+                "  Use optimizer_backend=flash_adamw for local runs without Transformer Engine.",
+                file=sys.stderr,
+            )
+            sys.exit(1)  # terminates the process rather than raising
+    elif optimizer_backend == "flash_adamw":
+        if not flash_available:
+            print("Error: optimizer_backend=flash_adamw requires flashoptim.", file=sys.stderr)
+            if flash_error:
+                print(f"  Import error: {flash_error}", file=sys.stderr)
+            print(
+                "  Install flashoptim, or run in an environment with Transformer Engine FusedAdam.",
+                file=sys.stderr,
+            )
+            sys.exit(1)  # terminates the process rather than raising
+        raw_config["optimizer"] = {  # replaces the whole section; only lr and weight_decay carry over
+            "_target_": "flashoptim.FlashAdamW",
+            "lr": raw_config.get("optimizer", {}).get("lr", cfg.learning_rate),  # YAML value wins; cfg is the fallback
+            "weight_decay": raw_config.get("optimizer", {}).get("weight_decay", cfg.weight_decay),  # same precedence
+            "betas": [0.9, 0.999],
+            "eps": 1.0e-8,
+            "quantize": False,
+            "compress_state_dict": False,
+            "master_weight_bits": cfg.flash_adamw_master_weight_bits,
+            "fused": True,
+        }
+        raw_config.setdefault("model", {})["torch_dtype"] = "bfloat16"  # creates the model section if absent
+
+    return config_node_cls(raw_config), optimizer_backend  # backend is the resolved name, never "auto"
+
+
 def run_finetune(cfg: FinetuneConfig) -> Path:
     """Run embedding model fine-tuning using nemo-automodel.
 
@@ -278,22 +359,32 @@ def run_finetune(cfg: FinetuneConfig) -> Path:
 
     # Import nemo-automodel components
     try:
-        from nemo_automodel.components.config.loader import load_yaml_config
-        from nemo_automodel.recipes.biencoder import TrainBiencoderRecipe
+        from nemo_automodel.components.config.loader import ConfigNode
+        from nemo_automodel.recipes.retrieval import TrainBiEncoderRecipe
     except ImportError as e:
         print("Error: Failed to import nemo-automodel. Is it installed?", file=sys.stderr)
         print("  Install with: pip install nemo-automodel", file=sys.stderr)
         print(f"  Error: {e}", file=sys.stderr)
         sys.exit(1)
 
-    # Load base config from nemo-automodel defaults
-    base_config_path = STAGE_PATH / "biencoder_base.yaml"
-    automodel_cfg = load_yaml_config(str(base_config_path))
+    # Load base config from nemo-automodel defaults. ConfigNode resolves _target_
+    # imports during construction, so optimizer selection must happen on raw YAML.
+    automodel_cfg, optimizer_backend = _load_automodel_config(cfg, ConfigNode)
+    optimizer_detail = optimizer_backend
+    if optimizer_backend == "flash_adamw":
+        optimizer_detail = (
+            f"{optimizer_backend} "
+            f"(bf16 model, {cfg.flash_adamw_master_weight_bits}-bit master weights)"
+        )
+    print(f"Optimizer:      {optimizer_detail}")
+    print()
 
     # Apply overrides from our config
     # Model settings
     automodel_cfg.model.pretrained_model_name_or_path = cfg.base_model
     automodel_cfg.tokenizer.pretrained_model_name_or_path = cfg.base_model
+    automodel_cfg.model.trust_remote_code = cfg.trust_remote_code
+    automodel_cfg.tokenizer.trust_remote_code = cfg.trust_remote_code
     # Auto-detect attention implementation if not explicitly set
     if cfg.attn_implementation is not None:
         attn_impl = cfg.attn_implementation
@@ -308,7 +399,7 @@ def run_finetune(cfg: FinetuneConfig) -> Path:
 
     # Data settings
     automodel_cfg.dataloader.dataset.data_dir_list = [str(cfg.train_data_path)]
-    automodel_cfg.dataloader.dataset.train_n_passages = cfg.train_n_passages
+    automodel_cfg.dataloader.dataset.n_passages = cfg.train_n_passages
     automodel_cfg.dataloader.collate_fn.q_max_len = cfg.query_max_length
     automodel_cfg.dataloader.collate_fn.p_max_len = cfg.passage_max_length
     automodel_cfg.dataloader.collate_fn.query_prefix = cfg.query_prefix
@@ -330,13 +421,13 @@ def run_finetune(cfg: FinetuneConfig) -> Path:
     # Model architecture
     automodel_cfg.model.pooling = cfg.pooling
     automodel_cfg.model.l2_normalize = cfg.l2_normalize
-    automodel_cfg.model.t = cfg.temperature
+    automodel_cfg.temperature = cfg.temperature
 
     # Checkpoint settings
     automodel_cfg.checkpoint.checkpoint_dir = str(cfg.checkpoint_dir)
 
-    # Create and run the biencoder recipe
-    recipe = TrainBiencoderRecipe(automodel_cfg)
+    # Create and run the bi-encoder recipe
+    recipe = TrainBiEncoderRecipe(automodel_cfg)
     recipe.setup()
     recipe.run_train_validation_loop()