From 391033b45b7e89cf1dbe8a90e9901bea36597ea9 Mon Sep 17 00:00:00 2001
From: Steve Han <sthan@nvidia.com>
Date: Fri, 24 Apr 2026 07:00:45 -0700
Subject: [PATCH] feat(embed): launch finetune with torchrun for multi-GPU
 support

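Launch the embed finetune script through torch.distributed.run with
the --nproc_per_node=gpu option so training automatically uses all
available GPUs (works correctly with 1 GPU too). The stage's runspec
launch mode changes from "direct" to "torchrun" to match.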

Mirrors the rerank recipe change in 756e4f2.

Signed-off-by: Steve Han <sthan@nvidia.com>
Made-with: Cursor
---
 src/nemotron/cli/commands/embed/finetune.py         | 4 +++-
 src/nemotron/recipes/embed/stage2_finetune/train.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/nemotron/cli/commands/embed/finetune.py b/src/nemotron/cli/commands/embed/finetune.py
index b9e1ca241..427c52c13 100644
--- a/src/nemotron/cli/commands/embed/finetune.py
+++ b/src/nemotron/cli/commands/embed/finetune.py
@@ -115,7 +115,9 @@ def _execute_uv_local(train_path: Path, passthrough: list[str]) -> None:
         uv_cmd, "run",
         "--with", str(repo_root),
         "--project", str(stage_dir),
-        "python", str(script_abs),
+        "python", "-m", "torch.distributed.run",
+        "--nproc_per_node=gpu",
+        str(script_abs),
         "--config", str(train_path),
         *passthrough,
     ]
diff --git a/src/nemotron/recipes/embed/stage2_finetune/train.py b/src/nemotron/recipes/embed/stage2_finetune/train.py
index b2e095d5f..88f1f59ac 100644
--- a/src/nemotron/recipes/embed/stage2_finetune/train.py
+++ b/src/nemotron/recipes/embed/stage2_finetune/train.py
@@ -8,7 +8,7 @@
 # setup = "PyTorch pre-installed. Stage dependencies resolved via UV at runtime."
 #
 # [tool.runspec.run]
-# launch = "direct"
+# launch = "torchrun"
 #
 # [tool.runspec.config]
 # dir = "./config"